mpi : fix after master merge

Merge branch 'mpi' into refactor-mpi
[mpi] continue-on-error: true
2026-04-23 16:37:33 +03:00 · 2023-07-09 22:23:04 +03:00 · 2023-07-09 22:20:14 +03:00 · 2023-07-09 15:10:43 -04:00 · 2023-07-09 15:05:58 -04:00 · 2023-07-09 15:02:19 -04:00
7 changed files with 86 additions and 106 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -20,7 +20,6 @@ build-static/
 build-cublas/
 build-opencl/
 build-metal/
-build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -321,11 +321,6 @@ if (LLAMA_MPI)
        set(c_flags   ${c_flags}   -Wno-cast-qual)
        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-        # Even if you're only using the C header, C++ programs may bring in MPI
-        # C++ functions, so more linkage is needed
-        if (MPI_CXX_FOUND)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}     ${MPI_CXX_LIBRARIES})
-        endif()
    else()
        message(WARNING "MPI not found")
    endif()
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -450,7 +450,6 @@ void ggml_metal_graph_compute(
                //}

                switch (dst->op) {
-                    case GGML_OP_NONE:
                    case GGML_OP_RESHAPE:
                    case GGML_OP_VIEW:
                    case GGML_OP_TRANSPOSE:
--- a/ggml-mpi.c
+++ b/ggml-mpi.c
@@ -56,7 +56,7 @@ void ggml_mpi_eval_init(
    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
 }

-static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
+int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
    if (t == NULL) {
        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
@@ -73,39 +73,13 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
    return -1;
 }

-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    MPI_Status status; UNUSED(status);
-
-    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
 // TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
+void ggml_mpi_graph_compute(
        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_context     * ctx,
             struct ggml_cgraph * gf,
-                            int   n_layers) {
+                            int   n_layers,
+                            int   n_threads) {
    const int mpi_rank = ctx_mpi->rank;
    const int mpi_size = ctx_mpi->size;

@@ -135,19 +109,41 @@ void ggml_mpi_graph_compute_pre(
    // node 0:   [(n-1) * n_per_node,            n_nodes)
    //
    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
+        if (mpi_rank == 1) { // the first node receives the input tokens from the main node
+            MPI_Status status; UNUSED(status);
+
+            const int mpi_rank_src = mpi_rank - 1;
+
+            const int retval = MPI_Recv(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+            GGML_ASSERT(retval == MPI_SUCCESS);
+        } else { // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            MPI_Status status; UNUSED(status);
+
+            const int mpi_rank_src = mpi_rank - 1;
+
+            //printf("%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
+            const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+            GGML_ASSERT(retval == MPI_SUCCESS);
        }
    } else if (mpi_size > 1) {
        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1);
+        {
+            const int mpi_rank_dst = mpi_rank + 1;
+
+            const int retval = MPI_Send(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_dst, 0, MPI_COMM_WORLD);
+            GGML_ASSERT(retval == MPI_SUCCESS);
+        }

        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
+        {
+            MPI_Status status; UNUSED(status);
+
+            const int mpi_rank_src = mpi_size - 1;
+
+            //fprintf(stderr, "%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
+            const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+            GGML_ASSERT(retval == MPI_SUCCESS);
+        }
    }

    {
@@ -198,19 +194,20 @@ void ggml_mpi_graph_compute_pre(

        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
    }
-}

-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers) {
-    UNUSED(n_layers);
+    ggml_graph_compute_with_ctx(ctx, gf, n_threads);

-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
+    //fprintf(stderr, "%s: node %d: done\n", __func__, mpi_rank);

    // send the output data to the next node
    if (mpi_rank > 0) {
-        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
+        struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1];
+
+        const int mpi_rank_dst = (mpi_rank + 1) % mpi_size;
+
+        //fprintf(stderr, "%s: node %d: sending %d elements to node %d\n", __func__, mpi_rank, ggml_nelements(output), mpi_rank_dst);
+
+        const int retval = MPI_Send(output->data, ggml_nelements(output), MPI_FLOAT, mpi_rank_dst, 0, MPI_COMM_WORLD);
+        GGML_ASSERT(retval == MPI_SUCCESS);
    }
 }
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -24,15 +24,12 @@ void ggml_mpi_eval_init(
                            int * n_past,
                            int * n_threads);

-void ggml_mpi_graph_compute_pre(
+void ggml_mpi_graph_compute(
        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_context     * ctx,
             struct ggml_cgraph * gf,
-                            int   n_layers);
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers);
+                            int   n_layers,
+                            int   n_threads);

 #ifdef __cplusplus
 }
--- a/llama.cpp
+++ b/llama.cpp
@@ -1632,10 +1632,6 @@ static bool llama_eval_internal(
    // run the computation
    ggml_build_forward_expand(&gf, cur);

-#if GGML_USE_MPI
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
@@ -1660,19 +1656,14 @@ static bool llama_eval_internal(

        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
    }
+#elif GGML_USE_MPI
+    ggml_mpi_graph_compute(lctx.ctx_mpi, ctx0, &gf, n_layer, n_threads);
+
+    cur = gf.nodes[gf.n_nodes - 1];
 #else
    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif

-#if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
-#endif
-
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
-    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
-
    if (cgraph_fname) {
        ggml_graph_export(&gf, cgraph_fname);
    }
@@ -1688,26 +1679,38 @@ static bool llama_eval_internal(
    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
    //}

-    // extract logits
+    //embd_w.resize(n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
+
+    // update kv token count
+    lctx.kv_self.n = n_past + N;
+
+#ifdef GGML_USE_MPI
+    if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
+#else
    {
-        auto & logits_out = lctx.logits;
+#endif
+        // extract logits
+        {
+            auto & logits_out = lctx.logits;

-        if (lctx.logits_all) {
-            logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
-        } else {
-            // return result for just the last token
-            logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            if (lctx.logits_all) {
+                logits_out.resize(n_vocab * N);
+                memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+            } else {
+                // return result for just the last token
+                logits_out.resize(n_vocab);
+                memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            }
        }
-    }

-    // extract embeddings
-    if (!lctx.embedding.empty()) {
-        auto & embedding_out = lctx.embedding;
+        // extract embeddings
+        if (!lctx.embedding.empty()) {
+            auto & embedding_out = lctx.embedding;

-        embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+            embedding_out.resize(n_embd);
+            memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+        }
    }

    if (mem_per_token == 0) {
@@ -2454,14 +2457,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        } else {
            new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                int nx = tensor.ne.at(0);
                int ny = tensor.ne.at(1);
                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
                }
            }
            if (tensor.name == "output.weight") {
@@ -2489,17 +2493,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            }
-            if (convert_incompatible_tensor) {
-                if (tensor.name == "output.weight") {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    fprintf(stderr, "F16 will be used for this tensor instead.\n");
-                } else if (tensor.name == "tok_embeddings.weight") {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
 #endif

            float * f32_data;
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
    llama_model * model;
    llama_context * ctx;

-    llama_backend_init(false);
+    llama_init_backend(false);

    // load the vocab
    {
@@ -99,7 +99,7 @@ int main(int argc, char **argv) {
    llama_free_model(model);
    llama_free(ctx);

-    llama_backend_free();
+    llama_finalize_backend();

    return 0;
 }
Author	SHA1	Message	Date
Georgi Gerganov	0492363137	mpi : fix after master merge	2023-07-09 22:23:04 +03:00
Georgi Gerganov	81c5ddd532	Merge branch 'mpi' into refactor-mpi	2023-07-09 22:20:14 +03:00
Evan Miller	03cc12be0d	[mpi] continue-on-error: true	2023-07-09 15:10:43 -04:00
Evan Miller	4a9a4748e9	Add OpenMPI to GH action	2023-07-09 15:05:58 -04:00
Evan Miller	0f557c2ac4	Merge branch 'master' into mpi	2023-07-09 15:02:19 -04:00
Georgi Gerganov	9da9d26c70	mpi : minor	2023-07-09 18:38:32 +03:00
Georgi Gerganov	beadbf3380	mpi : fix inference	2023-07-09 18:26:20 +03:00
Georgi Gerganov	ef37dd14e7	mpi : fix output tensor after MPI compute (still not working)	2023-07-09 17:01:08 +03:00
Georgi Gerganov	c717c5185f	mpi : various fixes - communication now works but results are wrong	2023-07-09 16:40:16 +03:00
Georgi Gerganov	01abb3b3b9	mpi : move all MPI logic into ggml-mpi Not tested yet	2023-07-09 16:04:27 +03:00
Georgi Gerganov	e339d35579	mpi : add names for layer inputs + prep ggml_mpi_graph_compute()	2023-07-09 14:42:36 +03:00
Georgi Gerganov	3232db628c	mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099 )	2023-07-09 14:08:53 +03:00
Evan Miller	ef61acfbf5	Add info to README	2023-07-07 09:02:23 -04:00
Evan Miller	55207ba2b8	Add GH workflow, fix test	2023-07-06 21:40:18 -04:00
Evan Miller	1f0a2cfeda	Update CMakeLists.txt	2023-07-06 21:25:34 -04:00
Evan Miller	06a239343c	PR comments	2023-07-06 20:18:41 -04:00
Evan Miller	32deabfdc8	Merge branch 'master' into mpi	2023-07-06 19:04:50 -04:00
Evan Miller	042c5b278f	wrap includes	2023-07-04 00:13:20 -04:00
Evan Miller	668ba5fe0b	fixes	2023-07-04 00:09:02 -04:00
Evan Miller	d05ca74dd8	fix warnings, update README	2023-07-03 23:53:43 -04:00
Evan Miller	f85785f650	MPI support, first cut	2023-07-03 21:51:05 -04:00