mpi : fix after master merge

Merge branch 'mpi' into refactor-mpi
[mpi] continue-on-error: true
2026-04-23 16:37:33 +03:00 · 2023-07-09 22:23:04 +03:00 · 2023-07-09 22:20:14 +03:00 · 2023-07-09 15:10:43 -04:00 · 2023-07-09 15:05:58 -04:00 · 2023-07-09 15:02:19 -04:00
25 changed files with 849 additions and 1940 deletions
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -10,13 +10,13 @@ shift
 # Join the remaining arguments into a single string
 arg2="$@"

-if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$arg2"
-elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$arg2"
-elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$arg2"
-elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
+if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
+    python3 ./convert.py $arg2
+elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
+    ./quantize $arg2
+elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
+    ./main $arg2
+elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
@@ -26,8 +26,6 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
            ./quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
-elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$arg2"
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
@@ -39,6 +37,4 @@ else
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
-    echo "  --server (-s): Run a model on the server"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
 fi
--- a/.gitignore
+++ b/.gitignore
@@ -20,7 +20,6 @@ build-static/
 build-cublas/
 build-opencl/
 build-metal/
-build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -272,7 +272,7 @@ if (LLAMA_CUBLAS)

    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
        else()
            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
        endif()
@@ -321,11 +321,6 @@ if (LLAMA_MPI)
        set(c_flags   ${c_flags}   -Wno-cast-qual)
        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-        # Even if you're only using the C header, C++ programs may bring in MPI
-        # C++ functions, so more linkage is needed
-        if (MPI_CXX_FOUND)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}     ${MPI_CXX_LIBRARIES})
-        endif()
    else()
        message(WARNING "MPI not found")
    endif()
--- a/16
+++ b/16
@@ -151,6 +151,9 @@ ifdef LLAMA_MPI
 	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
 	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
 	OBJS     += ggml-mpi.o
+
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

 ifdef LLAMA_OPENBLAS
@@ -223,6 +226,9 @@ ifdef LLAMA_METAL
 	CXXFLAGS += -DGGML_USE_METAL
 	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	OBJS     += ggml-metal.o
+
+ggml-metal.o: ggml-metal.m ggml-metal.h
+	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_METAL

 ifneq ($(filter aarch64%,$(UNAME_M)),)
@@ -247,16 +253,6 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

-ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_METAL
-
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
 ifdef LLAMA_NO_K_QUANTS
 k_quants.o: k_quants.c k_quants.h
 	$(CC) $(CFLAGS) -c $< -o $@
--- a/README.md
+++ b/README.md
@@ -239,7 +239,7 @@ In order to build llama.cpp you have three different options.
 - Using `Zig`:

    ```bash
-    zig build -Doptimize=ReleaseFast
+    zig build -Drelease-fast
    ```

 ### Metal Build
--- a/build.zig
+++ b/build.zig
@@ -1,19 +1,9 @@
 const std = @import("std");
-const commit_hash = @embedFile(".git/refs/heads/master");

-// Zig Version: 0.11.0-dev.3986+e05c242cd
+// Zig Version: 0.11.0-dev.3379+629f0d23b
 pub fn build(b: *std.build.Builder) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});
-
-    const config_header = b.addConfigHeader(
-        .{ .style = .blank, .include_path = "build-info.h" },
-        .{
-            .BUILD_NUMBER = 0,
-            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
-        },
-    );
-
    const lib = b.addStaticLibrary(.{
        .name = "llama",
        .target = target,
@@ -23,21 +13,24 @@ pub fn build(b: *std.build.Builder) void {
    lib.linkLibCpp();
    lib.addIncludePath(".");
    lib.addIncludePath("./examples");
-    lib.addConfigHeader(config_header);
-    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
+    lib.addCSourceFiles(&.{
+        "ggml.c",
+    }, &.{"-std=c11"});
+    lib.addCSourceFiles(&.{
+        "llama.cpp",
+    }, &.{"-std=c++11"});
    b.installArtifact(lib);

    const examples = .{
        "main",
        "baby-llama",
        "embedding",
-        "metal",
+        // "metal",
        "perplexity",
        "quantize",
        "quantize-stats",
        "save-load-state",
-        "server",
+        // "server",
        "simple",
        "train-text-from-scratch",
    };
@@ -50,19 +43,16 @@ pub fn build(b: *std.build.Builder) void {
        });
        exe.addIncludePath(".");
        exe.addIncludePath("./examples");
-        exe.addConfigHeader(config_header);
        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
            "examples/common.cpp",
        }, &.{"-std=c++11"});
        exe.linkLibrary(lib);
        b.installArtifact(exe);
-
        const run_cmd = b.addRunArtifact(exe);
        run_cmd.step.dependOn(b.getInstallStep());
        if (b.args) |args| run_cmd.addArgs(args);
-
-        const run_step = b.step("run-" ++ example_name, "Run the app");
+        const run_step = b.step("run_" ++ example_name, "Run the app");
        run_step.dependOn(&run_cmd.step);
    }
 }
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -236,24 +236,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "--cfg-negative-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.cfg_negative_prompt = argv[i];
-        } else if (arg == "--cfg-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.cfg_scale = std::stof(argv[i]);
-        } else if (arg == "--cfg-smooth-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.cfg_smooth_factor = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
@@ -488,10 +470,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, "  --cfg-negative-prompt PROMPT \n");
-    fprintf(stderr, "                        negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stderr, "  --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
@@ -558,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
    return res;
 }

-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx        = params.n_ctx;
@@ -574,12 +552,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    lparams.logits_all   = params.perplexity;
    lparams.embedding    = params.embedding;

-    return lparams;
-}
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
-    auto lparams = llama_context_params_from_gpt_params(params);
-
    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
--- a/examples/common.h
+++ b/examples/common.h
@@ -48,12 +48,6 @@ struct gpt_params {
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate

-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt;       // string to help guidance
-    float       cfg_scale         = 1.f;   // How strong is guidance
-    float       cfg_smooth_factor = 1.f;   // Smooth factor between old and new logits
-
    std::string model             = "models/7B/ggml-model.bin"; // model path
    std::string model_alias       = "unknown"; // model alias
    std::string prompt            = "";
@@ -105,7 +99,6 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 //

 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

 //
 // Console utils
--- a/examples/embd-input/README.md
+++ b/examples/embd-input/README.md
@@ -17,7 +17,7 @@ make
 import torch

 bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
-pth_path = "./examples/embd-input/llava_projection.pth"
+pth_path = "./examples/embd_input/llava_projection.pth"

 dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@@ -59,7 +59,7 @@ if __name__=="__main__":
    # Also here can use pytorch_model-00003-of-00003.bin directly.
    a.load_projection(os.path.join(
        os.path.dirname(__file__) ,
-        "llava_projection.pth"))
+        "llava_projetion.pth"))
    respose = a.chat_with_image(
        Image.open("./media/llama1-logo.png").convert('RGB'),
        "what is the text in the picture?")
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -109,16 +109,10 @@ int main(int argc, char ** argv) {

    llama_model * model;
    llama_context * ctx;
-    llama_context * ctx_guidance = NULL;
    g_ctx = &ctx;

    // load the model and apply lora adapter, if any
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
-        ctx_guidance = llama_new_context_with_model(model, lparams);
-    }
-
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
@@ -189,28 +183,15 @@ int main(int argc, char ** argv) {
    // tokenize the prompt
    std::vector<llama_token> embd_inp;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+
        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
    } else {
        embd_inp = session_tokens;
    }

-    // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-    }
-
    const int n_ctx = llama_n_ctx(ctx);

    if ((int) embd_inp.size() > n_ctx - 4) {
@@ -277,16 +258,6 @@ int main(int argc, char ** argv) {
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
        }
-
-        if (ctx_guidance) {
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
-            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
-            }
-        }
-
        if (params.n_keep > 0) {
        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
@@ -363,13 +334,11 @@ int main(int argc, char ** argv) {
    int n_remain           = params.n_predict;
    int n_consumed         = 0;
    int n_session_consumed = 0;
-    int n_past_guidance    = 0;

    // the first thing we will do is to output the prompt, so set color accordingly
    console_set_color(con_st, CONSOLE_COLOR_PROMPT);

    std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;

    // do one empty run to warm up the model
    {
@@ -398,12 +367,11 @@ int main(int argc, char ** argv) {
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+            if (n_past + (int) embd.size() > n_ctx) {
                const int n_left = n_past - params.n_keep;

                // always keep the first token - BOS
                n_past = std::max(1, params.n_keep);
-                n_past_guidance = std::max(1, params.n_keep + guidance_offset);

                // insert n_left/2 tokens at the start of embd from last_n_tokens
                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@@ -444,48 +412,6 @@ int main(int argc, char ** argv) {

            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
-
-            if (ctx_guidance) {
-                int input_size = 0;
-                llama_token* input_buf = NULL;
-
-                if (n_past_guidance < (int) guidance_inp.size()) {
-                    // Guidance context should have the same data with these modifications:
-                    //
-                    // * Replace the initial prompt
-                    // * Shift everything by guidance_offset
-                    embd_guidance = guidance_inp;
-                    if (embd.begin() + original_prompt_len < embd.end()) {
-                        embd_guidance.insert(
-                            embd_guidance.end(),
-                            embd.begin() + original_prompt_len,
-                            embd.end()
-                        );
-                    }
-
-                    input_buf = embd_guidance.data();
-                    input_size = embd_guidance.size();
-                    //fprintf(stderr, "\n---------------------\n");
-                    //for (int i = 0; i < (int) embd_guidance.size(); i++) {
-                        //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
-                    //}
-                    //fprintf(stderr, "\n---------------------\n");
-                } else {
-                    input_buf = embd.data();
-                    input_size = embd.size();
-                }
-
-                for (int i = 0; i < input_size; i += params.n_batch) {
-                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
-                        fprintf(stderr, "%s : failed to eval\n", __func__);
-                        return 1;
-                    }
-
-                    n_past_guidance += n_eval;
-                }
-            }
-
            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
@@ -505,7 +431,6 @@ int main(int argc, char ** argv) {
        }

        embd.clear();
-        embd_guidance.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // out of user input, sample next token
@@ -548,10 +473,6 @@ int main(int argc, char ** argv) {

                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-                if (ctx_guidance) {
-                    llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
-                }
-
                // Apply penalties
                float nl_logit = logits[llama_token_nl()];
                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
@@ -747,7 +668,6 @@ int main(int argc, char ** argv) {
    }

    llama_print_timings(ctx);
-    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1354,9 +1354,17 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
        }
    }

-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        if (t->src[i]) {
-            expand(g, t->src[i]);
+    if (t->src0) {
+        expand(g, t->src0);
+    }
+
+    if (t->src1) {
+        expand(g, t->src1);
+    }
+
+    for (int i = 0; i < GGML_MAX_OPT; ++i) {
+        if (t->opt[i]) {
+            expand(g, t->opt[i]);
        }
    }

--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -13,8 +13,6 @@
 #include "ggml-cuda.h"
 #include "ggml.h"

-#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -76,7 +74,7 @@ typedef void (*ggml_cuda_op_t)(

 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
+#define QI4_0 4
 typedef struct {
    half    d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -85,7 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0

 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
+#define QI4_1 4
 typedef struct {
    half    d;              // delta
    half    m;              // min
@@ -95,7 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
+#define QI5_0 4
 typedef struct {
    half d;                 // delta
    uint8_t qh[4];          // 5-th bit of quants
@@ -105,7 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
+#define QI5_1 4
 typedef struct {
    half d;                 // delta
    half m;                 // min
@@ -116,7 +114,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
+#define QI8_0 8
 typedef struct {
    half    d;              // delta
    int8_t  qs[QK8_0];      // quants
@@ -125,7 +123,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
+#define QI8_1 8
 typedef struct {
    half    d;              // delta
    half    s;              // unquantized sum
@@ -145,8 +143,6 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif

-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
@@ -155,8 +151,6 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
    uint8_t hmask[QK_K/8];     // quants - high bit
    uint8_t qs[QK_K/4];        // quants - low 2 bits
@@ -169,8 +163,6 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
    half    d[2];              // super-block scales/mins
@@ -188,8 +180,6 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif

-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
    half d;                  // super-block scale
@@ -209,8 +199,6 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif

-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
    uint8_t ql[QK_K/2];   // quants, lower 4 bits
    uint8_t qh[QK_K/4];   // quants, upper 2 bits
@@ -224,7 +212,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_

 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
-#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -252,13 +239,13 @@ struct ggml_tensor_extra_gpu {
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };

-static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= kx) {
+    if (i >= k) {
        return;
    }
-    dst[i] = x[i] + y[i%ky];
+    dst[i] = x[i] + y[i];
 }

 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -279,19 +266,6 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
    dst[i] = x[i] * y[i%ky];
 }

-static __global__ void gelu_f32(const float * x, float * dst, const int k) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    float xi = x[i];
-    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
-}
-
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -301,46 +275,16 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
    dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

-static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    const float eps = 1e-5f;
-
-    float mean = 0.0f;
-    float var = 0.0f;
-
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        const float xi = x[row*ncols + col];
-        mean += xi;
-        var += xi * xi;
-    }
-
-    // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
-        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
-    }
-
-    mean /= ncols;
-    var = var / ncols - mean * mean;
-    const float inv_var = rsqrtf(var + eps);
-
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
-    }
-}
-
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

-    const float eps = 1e-6f;
+    const float eps = 1e-6;

    float tmp = 0.0f; // partial sum for thread in warp

-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int i = 0; i < ncols; i += WARP_SIZE) {
+        const int col = i + tid;
        const float xi = x[row*ncols + col];
        tmp += xi * xi;
    }
@@ -352,9 +296,10 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
    }

    const float mean = tmp / ncols;
-    const float scale = rsqrtf(mean + eps);
+    const float scale = 1.0f / sqrtf(mean + eps);

-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int i = 0; i < ncols; i += WARP_SIZE) {
+        const int col = i + tid;
        dst[row*ncols + col] = scale * x[row*ncols + col];
    }
 }
@@ -1283,9 +1228,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
    y[iybs + iqs + y_offset] = v.y;
 }

-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    int vi;
@@ -1306,12 +1250,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    return sumi*d;
 #else
    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1332,12 +1275,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

    int qs;
@@ -1368,12 +1310,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    return sumi*d;
 #else
    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1403,12 +1344,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    int vi;
@@ -1423,220 +1363,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    return sumi*d;
 #else
    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    const float    d = bq2_K->d;
-    const float dmin = bq2_K->dmin;
-
-    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
-
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = bq2_K->scales[scale_offset + 2*i];
-
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const float d8i = bq8i->d;
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
-
-        sumf_d += d8i * (__dp4a(vi,         ui, 0) * (sc & 0xF)); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >>  4)); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    return d*sumf_d - dmin*sumf_m;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    float sumf = 0.0f;
-
-    const float d = bq3_K->d;
-
-    int vl;
-    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
-
-    int vh;
-    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
-    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
-    vh >>= bq8_offset;
-
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
-        const float d8i = bq8i->d;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
-
-        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    const int bq8_offset = QR4_K * (iqs / QI8_1);
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    const float    d = bq4_K->d;
-    const float dmin = bq4_K->dmin;
-
-    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const int isc = bq8_offset + i;
-
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq4_K->scales, sc, m);
-
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
-        const float d8i = bq8i->d;
-
-        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
-
-        sumf_d += d8i * (__dp4a(vi,         ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    return d*sumf_d - dmin*sumf_m;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int bq8_offset = QR5_K * (iqs / QI8_1);
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    const float    d = bq5_K->d;
-    const float dmin = bq5_K->dmin;
-
-    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
-
-    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
-
-    for (int i = 0; i < QR5_K; ++i) {
-        const int isc = bq8_offset + i;
-
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq5_K->scales, sc, m);
-
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
-        const float d8i = bq8i->d;
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> i) << 4) & 0x10101010;
-
-        const int vi = vil | vih;
-
-        sumf_d += d8i * (__dp4a(vi,         ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m);  // multiply constant part of q5_K with sum of q8_1 values
-    }
-
-    return d*sumf_d - dmin*sumf_m;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    float sumf = 0.0f;
-
-    const float d = bq6_K->d;
-
-    int vl;
-    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
-
-    int vh;
-    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
-
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = bq6_K->scales[scale_offset + 4*i];
-
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
-        const float d8i = bq8i->d;
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
-
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-
-        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= 600
 }

 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1659,7 +1386,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

-        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
+        const int iby = i + threadIdx.x / qi; // y block index

        const int iqs  = threadIdx.x % qi; // x block quant index when casting the quants to int

@@ -1897,40 +1624,6 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i = row*ncols + col;
-
-    const float col_theta_scale = powf(theta_scale, col);
-
-    const float theta = p*col_theta_scale;
-    const float sin_theta = sinf(theta);
-    const float cos_theta = cosf(theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta = block_p*col_theta_scale;
-    const float sin_block_theta = sinf(block_theta);
-    const float cos_block_theta = cosf(block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1996,9 +1689,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
    dst[i] = scale * x[i];
 }

-static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
-    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }

 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -2011,22 +1704,11 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
    mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }

-static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
-}
-
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    const dim3 block_dims(WARP_SIZE, 1, 1);
@@ -2192,7 +1874,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }

 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK4_0 == 0);
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2201,7 +1883,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK4_1 == 0);
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2210,7 +1892,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK5_0 == 0);
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2219,7 +1901,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK5_1 == 0);
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2228,7 +1910,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }

 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK8_0 == 0);
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2236,51 +1918,6 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

-static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2373,14 +2010,6 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 4 == 0);
-    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
-}
-
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2608,19 +2237,16 @@ inline void ggml_cuda_op_add(

    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i  != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);

-    const int64_t ne00 = src0->ne[0];
+    const int64_t ne0 = src0->ne[0];
    const int64_t i01_diff = i01_high - i01_low;

-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-
    // compute
    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
    } else {
        GGML_ASSERT(false);
    }
@@ -2639,43 +2265,29 @@ inline void ggml_cuda_op_mul(

    GGML_ASSERT(src0_ddf_i != nullptr);
    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i  != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);

    const int64_t ne00 = src0->ne[0];
-    const int64_t i01_diff = i01_high - i01_low;

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];

-    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
+    for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
+        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
+
+        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
+        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
+        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
+
+        // compute
+        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
+    }

    (void) dst;
    (void) src0_ddq_i;
    (void) i02;
 }

-inline void ggml_cuda_op_gelu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
-    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
-    cudaStream_t & cudaStream_main){
-
-    GGML_ASSERT(src0_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t i01_diff = i01_high - i01_low;
-
-    // compute
-    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-
-    (void) src1;
-    (void) dst;
-    (void) src0_ddq_i;
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
-}
-
 inline void ggml_cuda_op_silu(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2698,28 +2310,6 @@ inline void ggml_cuda_op_silu(
    (void) i1;
 }

-inline void ggml_cuda_op_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
-    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
-    cudaStream_t & cudaStream_main){
-
-    GGML_ASSERT(src0_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t i01_diff = i01_high - i01_low;
-
-    // compute
-    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-
-    (void) src1;
-    (void) dst;
-    (void) src0_ddq_i;
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
-}
-
 inline void ggml_cuda_op_rms_norm(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2760,22 +2350,13 @@ inline void ggml_cuda_op_mul_mat_vec(
    int id;
    CUDA_CHECK(cudaGetDevice(&id));

-    bool mul_mat_vec_q_implemented =
-        src0->type == GGML_TYPE_Q4_0 ||
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
        src0->type == GGML_TYPE_Q4_1 ||
        src0->type == GGML_TYPE_Q5_0 ||
        src0->type == GGML_TYPE_Q5_1 ||
        src0->type == GGML_TYPE_Q8_0;
-#if QK_K == 256
-    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
-        src0->type == GGML_TYPE_Q2_K ||
-        src0->type == GGML_TYPE_Q3_K ||
-        src0->type == GGML_TYPE_Q4_K ||
-        src0->type == GGML_TYPE_Q5_K ||
-        src0->type == GGML_TYPE_Q6_K;
-#endif // QK_K == 256

-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
 #endif

    if (use_mul_mat_vec_q) {
@@ -2801,21 +2382,6 @@ inline void ggml_cuda_op_mul_mat_vec(
            case GGML_TYPE_Q8_0:
                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                break;
-            case GGML_TYPE_Q2_K:
-                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-                break;
-            case GGML_TYPE_Q3_K:
-                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-                break;
-            case GGML_TYPE_Q4_K:
-                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-                break;
-            case GGML_TYPE_Q5_K:
-                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-                break;
-            case GGML_TYPE_Q6_K:
-                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-                break;
            default:
                GGML_ASSERT(false);
                break;
@@ -2950,21 +2516,13 @@ inline void ggml_cuda_op_rope(
    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
-    const int n_ctx  = ((int32_t *) src1->data)[3];
+    GGML_ASSERT(mode == 0);

    const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);

-    bool is_glm = mode & 4;
-
    // compute
-    if (is_glm) {
-        const float id_p = min(p, n_ctx - 2.f);
-        const float block_p = max(p - (n_ctx - 2.f), 0.f);
-        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
-    } else {
-        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    }
+    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);

    (void) dst;
    (void) src0_ddq_i;
@@ -3367,21 +2925,11 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }

-void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
-}
-
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }

-void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
-}
-
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3612,7 +3160,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
        }


-        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
+        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);

        extra->data_device[id] = buf;

@@ -3652,36 +3200,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
    }

    // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
-        const ggml_op src0_op = tensor->src[0]->op;
+    if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src0->op;
        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
        }
    }
-    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+    if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
    }

    tensor->backend = GGML_BACKEND_GPU;
    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
    memset(extra, 0, sizeof(*extra));

-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+    const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW ||
        force_inplace;
    const size_t size = ggml_nbytes(tensor);

    CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
+            memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
        }
        extra->data_device[g_main_device] = src0_ddc + offset;
    } else if (tensor->op == GGML_OP_CPY) {
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
        void * src1_ddv = src1_extra->data_device[g_main_device];
        extra->data_device[g_main_device] = src1_ddv;
    } else if (scratch) {
@@ -3752,8 +3300,8 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
    ggml_cuda_func_t func;
    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+        || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);

    switch (tensor->op) {
        case GGML_OP_ADD:
@@ -3768,24 +3316,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            }
            func = ggml_cuda_mul;
            break;
-        case GGML_OP_GELU:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_gelu;
-            break;
        case GGML_OP_SILU:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_silu;
            break;
-        case GGML_OP_NORM:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_norm;
-            break;
        case GGML_OP_RMS_NORM:
            if (!any_on_device) {
                return false;
@@ -3793,7 +3329,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
                return false;
            }
            func = ggml_cuda_mul_mat;
@@ -3847,6 +3383,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return true;
    }
-    func(tensor->src[0], tensor->src[1], tensor);
+    func(tensor->src0, tensor->src1, tensor);
    return true;
 }
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -393,8 +393,8 @@ void ggml_metal_graph_compute(
            for (int i = node_start; i < node_end; ++i) {
                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

-                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
-                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+                struct ggml_tensor * src0 = gf->nodes[i]->src0;
+                struct ggml_tensor * src1 = gf->nodes[i]->src1;
                struct ggml_tensor * dst  = gf->nodes[i];

                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
@@ -450,7 +450,6 @@ void ggml_metal_graph_compute(
                //}

                switch (dst->op) {
-                    case GGML_OP_NONE:
                    case GGML_OP_RESHAPE:
                    case GGML_OP_VIEW:
                    case GGML_OP_TRANSPOSE:
@@ -740,7 +739,8 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];

                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q2_K ||
                                         src0t == GGML_TYPE_Q3_K ||
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -365,10 +365,6 @@ kernel void kernel_rms_norm(
    }
 }

-// putting them in the kernel cause a significant performance penalty
-#define N_DST 4 // each SIMD group works on 4 rows
-#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 kernel void kernel_mul_mat_q4_0_f32(
        device const  void * src0,
        device const float * src1,
@@ -376,83 +372,64 @@ kernel void kernel_mul_mat_q4_0_f32(
        constant   int64_t & ne00,
        constant   int64_t & ne10,
        constant   int64_t & ne0,
-        constant   int64_t & ne01[[buffer(4)]],
+        threadgroup float  * sum [[threadgroup(0)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+        uint2 tpitg[[thread_position_in_threadgroup]],
+        uint2  tptg[[threads_per_threadgroup]]) {
    const int nb = ne00/QK4_0;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+
+    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
    device const float      * y = (device const float      *) src1 + r1*ne10;
-    block_q4_0 qb_curr, qb_next;
-    float4 y_curr[8];       // src1 vector cache
-    float sumf[N_DST]={0.f}, all_sum;
-    thread float * yl=(thread float *)y_curr;

-    // bootstrap
-    qb_curr = x[tiisg];
-    // each thread in a SIMD group deals with 1 block.
-    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+    const int nth = tptg.x*tptg.y;
+    const int ith = tptg.y*tpitg.x + tpitg.y;
+
+    const int ix = tpitg.y/4;           // 0 or 1
+    const int iy = tpitg.y - 4*ix;      // 0...3
+
+    const int first = 4 * iy;
+
+    float sumf = 0;
+
+    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
+
+        const float d = (float)x[i].d;
+
+        device const uint8_t * xl = x[i].qs + first;
+        device const float   * yl = y + i * QK4_0 + first;
+
+        float2 acc = {0.0f, 0.0f};
+
+        for (int j = 0; j < 4; ++j) {
+
+            acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
+            acc[1] += yl[j] + yl[j+16];

-        float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
-            y_curr[i] = *((device float4  *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
        }
-        sumy *= (-8.f);

-        for (int row = 0; row < N_DST; row++) {
-            // prefetch next x block
-            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
-
-            // calculate
-            float d = qb_curr.d;
-            float acc = sumy;
-            for (int i = 0; i < 16; i++) {
-                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
-            }
-            sumf[row] += d * acc;
-            qb_curr = qb_next;
-        }
+        sumf += d * (acc[0] - 8.f*acc[1]);
    }

-    if (nb % N_SIMDWIDTH == 0) {
-        for (int row = 0; row < N_DST; ++row) {
-            all_sum = simd_sum(sumf[row]);
-            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
-                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
-            }
-        }
-    } else {
+    sum[ith] = sumf;

-        float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
-            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
-        }
-        sumy *= (-8.f);
-
-        for (int row = 0; row < N_DST; row++) {
-            // prefetch next x block
-            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
-
-            // calculate
-            float d = qb_curr.d;
-            float acc = sumy;
-            for (int i = 0; i < 16; i++) {
-                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
-            }
-            if (tiisg < nb % N_SIMDWIDTH) {
-                sumf[row] += d * acc;
-            }
-            qb_curr = qb_next;
-
-            all_sum = simd_sum(sumf[row]);
-            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
-                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
-            }
-        }
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%4 == 0) {
+        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%16 == 0) {
+        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith == 0) {
+        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
+        dst[r1*ne0 + r0] = sum[0];
    }
 }

@@ -463,83 +440,65 @@ kernel void kernel_mul_mat_q4_1_f32(
        constant   int64_t & ne00,
        constant   int64_t & ne10,
        constant   int64_t & ne0,
-        constant   int64_t & ne01[[buffer(4)]],
+        threadgroup float  * sum [[threadgroup(0)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-    const int nb = ne00/QK4_0;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+        uint2 tpitg[[thread_position_in_threadgroup]],
+        uint2  tptg[[threads_per_threadgroup]]) {
+    const int nb = ne00/QK4_1;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+
+    device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
    device const float      * y = (device const float      *) src1 + r1*ne10;
-    block_q4_1 qb_curr, qb_next;
-    float4 y_curr[8];       // src1 vector cache
-    float sumf[N_DST]={0.f}, all_sum;
-    thread float * yl=(thread float *)y_curr;

-    // bootstrap
-    qb_curr = x[tiisg];
-    // each thread in a SIMD group deals with 1 block.
-    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+    const uint nth = tptg.x*tptg.y;
+    const uint ith = tptg.y*tpitg.x + tpitg.y;
+
+    const int ix = tpitg.y/4;           // 0 or 1
+    const int iy = tpitg.y - 4*ix;      // 0...3
+
+    const int first = 4 * iy;
+
+    float sumf = 0;
+
+    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
+
+        const float d = (float)x[i].d;
+        const float m = (float)x[i].m;
+
+        device const uint8_t * xl = x[i].qs + first;
+        device const float   * yl = y + i * QK4_1 + first;
+
+        float2 acc = {0.0f, 0.0f};
+
+        for (int j = 0; j < 4; ++j) {
+
+            acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
+            acc[1] += yl[j+16] * (d * (xl[j] >>  4) + m);

-        float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
-            y_curr[i] = *((device float4  *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
        }

-        for (int row = 0; row < N_DST; row++) {
-            // prefetch next x block
-            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
-
-            // calculate
-            const float d = qb_curr.d;
-            const float m = qb_curr.m;
-            float acc = 0.f;
-            for (int i = 0; i < 16; i++) {
-                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
-            }
-            sumf[row] += d * acc + m * sumy;
-            qb_curr = qb_next;
-        }
+        sumf += acc[0] + acc[1];
    }

-    if (nb % N_SIMDWIDTH == 0) {
-        for (int row = 0; row < N_DST; ++row) {
-            all_sum = simd_sum(sumf[row]);
-            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
-                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
-            }
-        }
-    } else {
+    sum[ith] = sumf;

-        float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
-            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
-        }
-
-        for (int row = 0; row < N_DST; row++) {
-            // prefetch next x block
-            qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
-
-            // calculate
-            const float d = qb_curr.d;
-            const float m = qb_curr.m;
-            float acc = 0.f;
-            for (int i = 0; i < 16; i++) {
-                acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
-            }
-            if (tiisg < nb % N_SIMDWIDTH) {
-                sumf[row] += d * acc + m * sumy;
-            }
-            qb_curr = qb_next;
-
-            all_sum = simd_sum(sumf[row]);
-            if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
-                dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
-            }
-        }
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%4 == 0) {
+        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith%16 == 0) {
+        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (ith == 0) {
+        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
+        dst[r1*ne0 + r0] = sum[0];
    }
 }

--- a/ggml-mpi.c
+++ b/ggml-mpi.c
@@ -56,7 +56,7 @@ void ggml_mpi_eval_init(
    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
 }

-static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
+int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
    if (t == NULL) {
        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
@@ -73,39 +73,13 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
    return -1;
 }

-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    MPI_Status status; UNUSED(status);
-
-    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
 // TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
+void ggml_mpi_graph_compute(
        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_context     * ctx,
             struct ggml_cgraph * gf,
-                            int   n_layers) {
+                            int   n_layers,
+                            int   n_threads) {
    const int mpi_rank = ctx_mpi->rank;
    const int mpi_size = ctx_mpi->size;

@@ -135,19 +109,41 @@ void ggml_mpi_graph_compute_pre(
    // node 0:   [(n-1) * n_per_node,            n_nodes)
    //
    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
+        if (mpi_rank == 1) { // the first node receives the input tokens from the main node
+            MPI_Status status; UNUSED(status);
+
+            const int mpi_rank_src = mpi_rank - 1;
+
+            const int retval = MPI_Recv(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+            GGML_ASSERT(retval == MPI_SUCCESS);
+        } else { // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            MPI_Status status; UNUSED(status);
+
+            const int mpi_rank_src = mpi_rank - 1;
+
+            //printf("%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
+            const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+            GGML_ASSERT(retval == MPI_SUCCESS);
        }
    } else if (mpi_size > 1) {
        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1);
+        {
+            const int mpi_rank_dst = mpi_rank + 1;
+
+            const int retval = MPI_Send(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_dst, 0, MPI_COMM_WORLD);
+            GGML_ASSERT(retval == MPI_SUCCESS);
+        }

        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
+        {
+            MPI_Status status; UNUSED(status);
+
+            const int mpi_rank_src = mpi_size - 1;
+
+            //fprintf(stderr, "%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
+            const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+            GGML_ASSERT(retval == MPI_SUCCESS);
+        }
    }

    {
@@ -175,11 +171,11 @@ void ggml_mpi_graph_compute_pre(
        // attach the input data to all nodes that need it
        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
        for (int i = idx_l0; i < idx_l1; i++) {
-            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[0] =  inp0;
+            if (gf->nodes[i]->src0 == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src0 =  inp0;
            }
-            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[1] =  inp0;
+            if (gf->nodes[i]->src1 == gf->nodes[idx_l0]) {
+                gf->nodes[i]->src1 =  inp0;
            }
        }

@@ -198,19 +194,20 @@ void ggml_mpi_graph_compute_pre(

        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
    }
-}

-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers) {
-    UNUSED(n_layers);
+    ggml_graph_compute_with_ctx(ctx, gf, n_threads);

-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
+    //fprintf(stderr, "%s: node %d: done\n", __func__, mpi_rank);

    // send the output data to the next node
    if (mpi_rank > 0) {
-        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
+        struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1];
+
+        const int mpi_rank_dst = (mpi_rank + 1) % mpi_size;
+
+        //fprintf(stderr, "%s: node %d: sending %d elements to node %d\n", __func__, mpi_rank, ggml_nelements(output), mpi_rank_dst);
+
+        const int retval = MPI_Send(output->data, ggml_nelements(output), MPI_FLOAT, mpi_rank_dst, 0, MPI_COMM_WORLD);
+        GGML_ASSERT(retval == MPI_SUCCESS);
    }
 }
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -24,15 +24,12 @@ void ggml_mpi_eval_init(
                            int * n_past,
                            int * n_threads);

-void ggml_mpi_graph_compute_pre(
+void ggml_mpi_graph_compute(
        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_context     * ctx,
             struct ggml_cgraph * gf,
-                            int   n_layers);
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers);
+                            int   n_layers,
+                            int   n_threads);

 #ifdef __cplusplus
 }
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -132,10 +132,10 @@
 //   {
 //       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 //
-//       // a[2, 1] = 1.0f;
+//       // a[1, 2] = 1.0f;
 //       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 //
-//       // a[0, 2] = 2.0f;
+//       // a[2, 0] = 2.0f;
 //       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 //
 //       ...
@@ -197,17 +197,12 @@
 #define GGML_MAX_NODES         4096
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
-#define GGML_MAX_SRC           6
+#define GGML_MAX_OPT           4
 #define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4

-
-#define GGML_EXIT_SUCCESS 0
-#define GGML_EXIT_ABORTED 1
-
 #define GGML_UNUSED(x) (void)(x)

-
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
@@ -368,8 +363,6 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
-        GGML_OP_POOL_1D,
-        GGML_OP_POOL_2D,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@@ -421,7 +414,9 @@ extern "C" {
        bool is_param;

        struct ggml_tensor * grad;
-        struct ggml_tensor * src[GGML_MAX_SRC];
+        struct ggml_tensor * src0;
+        struct ggml_tensor * src1;
+        struct ggml_tensor * opt[GGML_MAX_OPT];

        // performance
        int     perf_runs;
@@ -449,10 +444,6 @@ extern "C" {

        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
        int n_tasks[GGML_MAX_NODES];
-
-        // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
    };

    // computation graph
@@ -1175,31 +1166,6 @@ extern "C" {
            int                   s,
            int                   d);

-    enum ggml_op_pool {
-        GGML_OP_POOL_MAX,
-        GGML_OP_POOL_AVG,
-        GGML_OP_POOL_COUNT,
-    };
-
-    GGML_API struct ggml_tensor* ggml_pool_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_op_pool     op,
-            int                   k0, // kernel size
-            int                   s0, // stride
-            int                   p0); // padding
-
-    GGML_API struct ggml_tensor* ggml_pool_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_op_pool     op,
-            int                   k0,
-            int                   k1,
-            int                   s0,
-            int                   s1,
-            int                   p0,
-            int                   p1);
-
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1339,7 +1305,7 @@ extern "C" {
    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API               int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API              void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
    GGML_API              void ggml_graph_reset  (struct ggml_cgraph * cgraph);

    // same as ggml_graph_compute() but the work data is allocated as a part of the context
--- a/k_quants.h
+++ b/k_quants.h
@@ -15,14 +15,6 @@
 #define K_SCALE_SIZE 12
 #endif

-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
 //
 // Super-block quantization structures
 //
--- a/llama.cpp
+++ b/llama.cpp
@@ -303,7 +303,7 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
    ~llama_context() {
        if (ctx_metal) {
@@ -324,6 +324,7 @@ struct llama_context {
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

    const llama_model & model;
+    const llama_vocab & vocab;

    bool model_owner = false;

@@ -1631,10 +1632,6 @@ static bool llama_eval_internal(
    // run the computation
    ggml_build_forward_expand(&gf, cur);

-#if GGML_USE_MPI
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
@@ -1659,19 +1656,14 @@ static bool llama_eval_internal(

        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
    }
+#elif GGML_USE_MPI
+    ggml_mpi_graph_compute(lctx.ctx_mpi, ctx0, &gf, n_layer, n_threads);
+
+    cur = gf.nodes[gf.n_nodes - 1];
 #else
    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif

-#if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
-#endif
-
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
-    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
-
    if (cgraph_fname) {
        ggml_graph_export(&gf, cgraph_fname);
    }
@@ -1687,26 +1679,38 @@ static bool llama_eval_internal(
    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
    //}

-    // extract logits
+    //embd_w.resize(n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
+
+    // update kv token count
+    lctx.kv_self.n = n_past + N;
+
+#ifdef GGML_USE_MPI
+    if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
+#else
    {
-        auto & logits_out = lctx.logits;
+#endif
+        // extract logits
+        {
+            auto & logits_out = lctx.logits;

-        if (lctx.logits_all) {
-            logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
-        } else {
-            // return result for just the last token
-            logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            if (lctx.logits_all) {
+                logits_out.resize(n_vocab * N);
+                memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+            } else {
+                // return result for just the last token
+                logits_out.resize(n_vocab);
+                memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            }
        }
-    }

-    // extract embeddings
-    if (!lctx.embedding.empty()) {
-        auto & embedding_out = lctx.embedding;
+        // extract embeddings
+        if (!lctx.embedding.empty()) {
+            auto & embedding_out = lctx.embedding;

-        embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+            embedding_out.resize(n_embd);
+            memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+        }
    }

    if (mem_per_token == 0) {
@@ -2166,62 +2170,6 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
    }
 }

-static void llama_log_softmax(float * array, size_t size) {
-    float max_l = *std::max_element(array, array + size);
-    float sum = 0.f;
-    for (size_t i = 0; i < size; ++i) {
-        float p = expf(array[i] - max_l);
-        sum += p;
-        array[i] = p;
-    }
-
-    for (size_t i = 0; i < size; ++i) {
-        array[i] = logf(array[i] / sum);
-    }
-}
-
-void llama_sample_classifier_free_guidance(
-          struct llama_context * ctx,
-        llama_token_data_array * candidates,
-          struct llama_context * guidance_ctx,
-                         float   scale,
-                         float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
-
-    assert(ctx);
-    auto n_vocab = llama_n_vocab(ctx);
-    assert(n_vocab == (int)candidates->size);
-    assert(!candidates->sorted);
-
-    std::vector<float> logits_base;
-    logits_base.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        logits_base.push_back(candidates->data[i].logit);
-    }
-    llama_log_softmax(logits_base.data(), candidates->size);
-
-    float* logits_guidance = llama_get_logits(guidance_ctx);
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_guidance = logits_guidance[i];
-        float logit_base = logits_base[i];
-        logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
-    }
-
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_base = logits_base[i];
-        float logit_guidance = logits_guidance[i];
-
-        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}

 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
    assert(ctx);
@@ -2509,14 +2457,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        } else {
            new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                int nx = tensor.ne.at(0);
                int ny = tensor.ne.at(1);
                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
                }
            }
            if (tensor.name == "output.weight") {
@@ -2544,17 +2493,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            }
-            if (convert_incompatible_tensor) {
-                if (tensor.name == "output.weight") {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    fprintf(stderr, "F16 will be used for this tensor instead.\n");
-                } else if (tensor.name == "tok_embeddings.weight") {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
 #endif

            float * f32_data;
@@ -2696,7 +2634,7 @@ struct llama_context * llama_new_context_with_model(
        return nullptr;
    }

-    llama_context * ctx = new llama_context(*model);
+    llama_context * ctx = new llama_context(*model, model->vocab);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
@@ -3534,13 +3472,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
    return 0;
 }

-int llama_tokenize_with_model(
-    const struct llama_model * model,
+int llama_tokenize(
+        struct llama_context * ctx,
                  const char * text,
                 llama_token * tokens,
                         int   n_max_tokens,
                        bool   add_bos) {
-    auto res = llama_tokenize(model->vocab, text, add_bos);
+    auto res = llama_tokenize(ctx->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
        fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3554,29 +3492,8 @@ int llama_tokenize_with_model(
    return res.size();
 }

-int llama_tokenize(
-        struct llama_context * ctx,
-                  const char * text,
-                 llama_token * tokens,
-                         int   n_max_tokens,
-                        bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
-}
-
-int llama_n_vocab_from_model(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
-}
-
-int llama_n_ctx_from_model(const struct llama_model * model) {
-    return model->hparams.n_ctx;
-}
-
-int llama_n_embd_from_model(const struct llama_model * model) {
-    return model->hparams.n_embd;
-}
-
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model.vocab.id_to_token.size();
+    return ctx->vocab.id_to_token.size();
 }

 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3587,25 +3504,17 @@ int llama_n_embd(const struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
 }

-int llama_get_vocab_from_model(
-        const struct llama_model * model,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
-    for (int i = 0; i<n; ++i) {
-        strings[i] = model->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = model->vocab.id_to_token[i].score;
-    }
-    return n;
-}
-
 int llama_get_vocab(
        const struct llama_context * ctx,
        const char * * strings,
        float  * scores,
        int capacity) {
-    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
 }

 float * llama_get_logits(struct llama_context * ctx) {
@@ -3616,16 +3525,12 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }

-const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
-    if (token >= llama_n_vocab_from_model(model)) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    if (token >= llama_n_vocab(ctx)) {
        return nullptr;
    }

-    return model->vocab.id_to_token[token].tok.c_str();
-}
-
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    return llama_token_to_str_with_model(&ctx->model, token);
+    return ctx->vocab.id_to_token[token].tok.c_str();
 }

 llama_token llama_token_bos() {
--- a/llama.h
+++ b/llama.h
@@ -270,21 +270,10 @@ extern "C" {
                             int   n_max_tokens,
                            bool   add_bos);

-    LLAMA_API int llama_tokenize_with_model(
-        const struct llama_model * model,
-                      const char * text,
-                     llama_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
    // Get the vocabulary as output parameters.
    // Returns number of results.
    LLAMA_API int llama_get_vocab(
@@ -293,12 +282,6 @@ extern "C" {
                                 float * scores,
                                   int   capacity);

-    LLAMA_API int llama_get_vocab_from_model(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
    // Can be mutated in order to change the probabilities of the next token
@@ -311,13 +294,7 @@ extern "C" {
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
-            const struct llama_context * ctx,
-                           llama_token   token);
-
-    LLAMA_API const char * llama_token_to_str_with_model(
-              const struct llama_model * model,
-                           llama_token   token);
+    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

    // Special tokens
    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
@@ -332,18 +309,6 @@ extern "C" {
    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

-    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
-    LLAMA_API void llama_sample_classifier_free_guidance(
-              struct llama_context * ctx,
-            llama_token_data_array * candidates,
-              struct llama_context * guidance_ctx,
-                             float   scale,
-                             float   smooth_factor);
-
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -10,9 +10,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif

 #define MAX_NARGS 3

--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -7,9 +7,7 @@

 #define MAX_NARGS 2

-#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif

 //
 // logging
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
    llama_model * model;
    llama_context * ctx;

-    llama_backend_init(false);
+    llama_init_backend(false);

    // load the vocab
    {
@@ -99,7 +99,7 @@ int main(int argc, char **argv) {
    llama_free_model(model);
    llama_free(ctx);

-    llama_backend_free();
+    llama_finalize_backend();

    return 0;
 }
Author	SHA1	Message	Date
Georgi Gerganov	0492363137	mpi : fix after master merge	2023-07-09 22:23:04 +03:00
Georgi Gerganov	81c5ddd532	Merge branch 'mpi' into refactor-mpi	2023-07-09 22:20:14 +03:00
Evan Miller	03cc12be0d	[mpi] continue-on-error: true	2023-07-09 15:10:43 -04:00
Evan Miller	4a9a4748e9	Add OpenMPI to GH action	2023-07-09 15:05:58 -04:00
Evan Miller	0f557c2ac4	Merge branch 'master' into mpi	2023-07-09 15:02:19 -04:00
Georgi Gerganov	9da9d26c70	mpi : minor	2023-07-09 18:38:32 +03:00
Georgi Gerganov	beadbf3380	mpi : fix inference	2023-07-09 18:26:20 +03:00
Georgi Gerganov	ef37dd14e7	mpi : fix output tensor after MPI compute (still not working)	2023-07-09 17:01:08 +03:00
Georgi Gerganov	c717c5185f	mpi : various fixes - communication now works but results are wrong	2023-07-09 16:40:16 +03:00
Georgi Gerganov	01abb3b3b9	mpi : move all MPI logic into ggml-mpi Not tested yet	2023-07-09 16:04:27 +03:00
Georgi Gerganov	e339d35579	mpi : add names for layer inputs + prep ggml_mpi_graph_compute()	2023-07-09 14:42:36 +03:00
Georgi Gerganov	3232db628c	mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099 )	2023-07-09 14:08:53 +03:00
Evan Miller	ef61acfbf5	Add info to README	2023-07-07 09:02:23 -04:00
Evan Miller	55207ba2b8	Add GH workflow, fix test	2023-07-06 21:40:18 -04:00
Evan Miller	1f0a2cfeda	Update CMakeLists.txt	2023-07-06 21:25:34 -04:00
Evan Miller	06a239343c	PR comments	2023-07-06 20:18:41 -04:00
Evan Miller	32deabfdc8	Merge branch 'master' into mpi	2023-07-06 19:04:50 -04:00
Evan Miller	042c5b278f	wrap includes	2023-07-04 00:13:20 -04:00
Evan Miller	668ba5fe0b	fixes	2023-07-04 00:09:02 -04:00
Evan Miller	d05ca74dd8	fix warnings, update README	2023-07-03 23:53:43 -04:00
Evan Miller	f85785f650	MPI support, first cut	2023-07-03 21:51:05 -04:00