Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-23 16:37:33 +03:00)

Compare commits: b2354...gg/fix-emb (5 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 4ec0e9abbf | |
| | e66da356a4 | |
| | 9bbeb0f110 | |
| | eb42596277 | |
| | d0347840c1 | |
@@ -1,6 +1,5 @@
{
lib,
glibc,
config,
stdenv,
mkShell,
@@ -31,11 +30,6 @@
useRocm ? config.rocmSupport,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic
}@inputs:

let
@@ -47,7 +41,10 @@ let
versionOlder
;

# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;

suffices =
lib.optionals useBlas [ "BLAS" ]
@@ -170,9 +167,6 @@ effectiveStdenv.mkDerivation (
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
glibc.static
];

buildInputs =
@@ -187,7 +181,7 @@ effectiveStdenv.mkDerivation (
[
(cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
@@ -196,7 +190,6 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
++ optionals useCuda [
(

@@ -10,7 +10,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Recent API changes

- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

### Hot topics

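The Mar 4 embeddings change (PR 5796) is the API exercised by several of the diffs below: with a pooling type set, per-sequence embeddings are fetched with `llama_get_embeddings_seq`, while per-token embeddings still come from `llama_get_embeddings_ith`. A minimal sketch of the consumer-side fallback pattern (the helper name `get_embd` is made up for illustration; error handling and context setup are omitted):

```cpp
#include "llama.h"

// Prefer the pooled per-sequence embedding; fall back to the per-token one.
static const float * get_embd(llama_context * ctx, int i_token, llama_seq_id seq_id) {
    const float * embd = llama_get_embeddings_seq(ctx, seq_id); // non-NULL only when pooling is enabled
    if (embd == NULL) {
        embd = llama_get_embeddings_ith(ctx, i_token);          // token-level embedding
    }
    return embd; // may still be NULL if embeddings were not requested
}
```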
@@ -45,8 +45,7 @@ fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
echo "source /opt/intel/oneapi/setvars.sh"
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
exit 1
fi

@@ -19,12 +19,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
endif()
endif()

if(EXISTS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")

@@ -513,6 +513,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_sequences = std::stoi(argv[i]);
} else if (arg == "--p-accept" || arg == "-pa") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.p_accept = std::stof(argv[i]);
} else if (arg == "--p-split" || arg == "-ps") {
if (++i >= argc) {
invalid_param = true;
@@ -1038,6 +1044,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");

@@ -53,10 +53,11 @@ struct gpt_params {
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

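For context, the two new `gpt_params` fields are read by the speculative example further down: drafting for a sequence stops once the draft model's top candidate falls below `p_accept`, and a branch is split when a runner-up candidate is above `p_split`. A rough sketch of that gating decision only (the `draft_action` helper is hypothetical, not part of the patch):

```cpp
#include <vector>
#include "llama.h"

enum class draft_action { stop, keep_drafting, split_branch };

// cand: draft-model candidates for the current position, sorted by probability (descending).
static draft_action decide(const std::vector<llama_token_data> & cand, float p_accept, float p_split) {
    if (cand.empty() || cand[0].p < p_accept) {
        return draft_action::stop;          // draft confidence too low: stop drafting this sequence
    }
    if (cand.size() > 1 && cand[1].p > p_split) {
        return draft_action::split_branch;  // runner-up is strong enough to fork a second draft sequence
    }
    return draft_action::keep_drafting;
}
```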
@@ -295,77 +295,6 @@ static llama_token llama_sampling_sample_impl(
return id;
}

static llama_token_data_array llama_sample_probability_distribution_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const llama_sampling_params & params = ctx_sampling->params;

const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const bool penalize_nl = params.penalize_nl;

auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;

// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);

// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;

// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}

if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}

cur.clear();

for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}

llama_token_data_array cur_p = { cur.data(), cur.size(), false };

// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}

// apply grammar checks
if (ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}

llama_sample_softmax(ctx_main, &cur_p);
return cur_p;
}

llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
@@ -375,14 +304,6 @@ llama_token llama_sampling_sample(
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
}

llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
}

void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,

@@ -131,13 +131,6 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg,
int idx = 0);

// returns the probability that token of given id will be sampled
llama_token_data_array llama_sampling_probability_distribution(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);

void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,

@@ -36,10 +36,8 @@ class SentencePieceTokenTypes(IntEnum):
UNUSED = 5
BYTE = 6


AnyModel = TypeVar("AnyModel", bound="type[Model]")


class Model(ABC):
_model_classes: dict[str, type[Model]] = {}

@@ -189,7 +187,6 @@ class Model(ABC):
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
assert names

def func(modelcls: type[Model]):
for name in names:
cls._model_classes[name] = modelcls

convert.py (13 changed lines)
@@ -1377,6 +1377,7 @@ def main(args_in: list[str] | None = None) -> None:
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
@@ -1392,6 +1393,18 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

args = parser.parse_args(args_in)
if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model"
if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.")
else:
tmp_model_path.mkdir(parents=True, exist_ok=True)
print("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
print(f"Saved weighted model at {tmp_model_path}.")
args.model = tmp_model_path

if args.dump_single:
model_plus = lazy_load_file(args.model)

@@ -23,7 +23,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}

static void normalize(const float * vec, float * out, int n) {
static void normalize(float * vec, float * out, int n) {
float norm = 0;
for (int i = 0; i < n; i++) {
norm += vec[i] * vec[i];
@@ -50,18 +50,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
continue;
}

// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}

float * emb = llama_get_embeddings_ith(ctx, i);
float * out = output + batch.seq_id[i][0] * n_embd;
normalize(embd, out, n_embd);
normalize(emb, out, n_embd);
}
}

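Since the example L2-normalizes each embedding before storing it, downstream similarity between two stored vectors reduces to a plain dot product. A small illustrative helper (not part of the diff), assuming both inputs were produced by `normalize()` above:

```cpp
// Cosine similarity of two already-normalized embeddings of length n.
static float cosine_sim_normalized(const float * a, const float * b, int n) {
    float dot = 0.0f;
    for (int i = 0; i < n; i++) {
        dot += a[i] * b[i];
    }
    return dot; // in [-1, 1]
}
```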
@@ -2,7 +2,7 @@ import asyncio
import requests
import numpy as np

n = 8
n = 1

result = []

@@ -13,7 +13,10 @@ async def main():
model_url = "http://127.0.0.1:6900"
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
url= f"{model_url}/embedding",
json= {"content": str(i)*1024}
json= {"content": str(0)*32}
#json= {"content": str(0)*1024}
#json= {"content": str(i)*32}
#json= {"content": str(i%2)*32}
) for i in range(n)])

for response in responses:

@@ -413,7 +413,7 @@ struct llama_server_context
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "chatml";
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
}
}

@@ -1235,22 +1235,12 @@ struct llama_server_context
continue;
}

const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
res.result_json = json
{
{"embedding", std::vector<float>(n_embd, 0.0f)},
};
continue;
}
}
const float * data = llama_get_embeddings_ith(ctx, i);
std::vector<float> embedding(data, data + n_embd);

res.result_json = json
{
{"embedding", std::vector<float>(embd, embd + n_embd)},
{"embedding", embedding },
};
}
}

@@ -6,4 +6,3 @@ More info:

- https://github.com/ggerganov/llama.cpp/pull/2926
- https://github.com/ggerganov/llama.cpp/pull/3624
- https://github.com/ggerganov/llama.cpp/pull/5625

@@ -5,7 +5,6 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
@@ -19,7 +18,6 @@ struct seq_draft {
|
||||
std::vector<int> i_batch_tgt;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
std::vector<std::vector<llama_token_data>> dists;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling;
|
||||
};
|
||||
@@ -39,15 +37,12 @@ int main(int argc, char ** argv) {
|
||||
// max number of parallel drafting sequences (i.e. tree branches)
|
||||
const int n_seq_dft = params.n_parallel;
|
||||
|
||||
// probability threshold for accepting a token from the draft model
|
||||
const float p_accept = params.p_accept;
|
||||
|
||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||
const float p_split = params.p_split;
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
std::default_random_engine rng(params.seed);
|
||||
std::uniform_real_distribution<> u_dist;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
@@ -171,9 +166,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
if (params.sparams.temp == 0) {
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
}
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
@@ -189,15 +182,12 @@ int main(int argc, char ** argv) {
|
||||
drafts[0].i_batch_tgt[0] = 0;
|
||||
|
||||
while (true) {
|
||||
std::set<int> active_seqs = {};
|
||||
|
||||
// print current draft sequences
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
active_seqs.insert(s);
|
||||
const auto & tokens = drafts[s].tokens;
|
||||
|
||||
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
|
||||
@@ -206,156 +196,48 @@ int main(int argc, char ** argv) {
|
||||
int i_dft = 0;
|
||||
int s_keep = 0;
|
||||
|
||||
llama_token token_id;
|
||||
std::string token_str;
|
||||
|
||||
// loop until we fail to accept a drafted token or we run out of drafted tokens
|
||||
while (true) {
|
||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
// sample from the target model
|
||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||
|
||||
if (!params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
|
||||
if (id == llama_token_eos(model_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
++n_predict;
|
||||
|
||||
// check if the target token matches any of the drafts
|
||||
// for stochastic sampling, attempt to match the token with the drafted tokens
|
||||
{
|
||||
bool accept = false;
|
||||
if (params.sparams.temp > 0) {
|
||||
// stochastic verification
|
||||
bool matches = false;
|
||||
|
||||
llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
float p_tgt = 0, p_dft = 0;
|
||||
|
||||
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
|
||||
|
||||
while (active_seqs.size() > 0) {
|
||||
// randomly select a sequence to verify from active sequences
|
||||
std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
|
||||
int s = *std::next(active_seqs.begin(), u_int_dist(rng));
|
||||
if (i_dft >= (int) drafts[s].tokens.size()) {
|
||||
drafts[s].active = false;
|
||||
active_seqs.erase(s);
|
||||
continue;
|
||||
}
|
||||
if (accept) {
|
||||
// if we already accepted a token, we can skip the rest
|
||||
if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
|
||||
drafts[s].active = false;
|
||||
active_seqs.erase(s);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
||||
float r = u_dist(rng);
|
||||
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
|
||||
// acquire the token probabilities assigned by the draft and target models
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
||||
p_tgt = dist_tgt.data[i].p;
|
||||
}
|
||||
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
|
||||
p_dft = dist_dft.data[i].p;
|
||||
}
|
||||
if (p_tgt && p_dft) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
|
||||
if (r <= p_tgt / p_dft) {
|
||||
s_keep = s;
|
||||
accept = true;
|
||||
token_id = drafts[s].tokens[i_dft];
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
|
||||
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
break;
|
||||
} else {
|
||||
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
|
||||
drafts[s].active = false;
|
||||
|
||||
// calculate residual probability
|
||||
GGML_ASSERT(dist_tgt.sorted);
|
||||
GGML_ASSERT(dist_dft.sorted);
|
||||
float sum_probs = 0.0f;
|
||||
|
||||
// sort dist by id
|
||||
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||
return a.id < b.id;
|
||||
});
|
||||
std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||
return a.id < b.id;
|
||||
});
|
||||
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
|
||||
sum_probs += dist_tgt.data[i].p;
|
||||
}
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
dist_tgt.data[i].p /= sum_probs;
|
||||
}
|
||||
|
||||
// sort dist_tgt by p desc
|
||||
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||
return a.p > b.p;
|
||||
});
|
||||
}
|
||||
|
||||
active_seqs.erase(s);
|
||||
for(int i = 0; i < n_seq_dft; i++) {
|
||||
if (i == s) {
|
||||
continue;
|
||||
}
|
||||
if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
|
||||
// synchronize active status for sequences with the same drafted token
|
||||
drafts[i].active = drafts[i].active && accept;
|
||||
if (!drafts[i].active) {
|
||||
active_seqs.erase(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!accept) {
|
||||
// all drafted tokens were rejected
|
||||
// sample from the target model
|
||||
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
|
||||
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
}
|
||||
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
|
||||
|
||||
} else {
|
||||
// greedy verification
|
||||
|
||||
// sample from the target model
|
||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
|
||||
s_keep = s;
|
||||
accept = true;
|
||||
} else {
|
||||
drafts[s].active = false;
|
||||
}
|
||||
s_keep = s;
|
||||
matches = true;
|
||||
} else {
|
||||
drafts[s].active = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (token_id == llama_token_eos(model_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
++n_predict;
|
||||
|
||||
if (accept) {
|
||||
if (matches) {
|
||||
++n_accept;
|
||||
++n_past_tgt;
|
||||
++n_past_dft;
|
||||
@@ -363,21 +245,17 @@ int main(int argc, char ** argv) {
|
||||
if (params.use_color) {
|
||||
// Color token according to its origin sequence
|
||||
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
||||
} else {
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
fflush(stdout);
|
||||
continue;
|
||||
} else {
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
{
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||
|
||||
// TODO: simplify
|
||||
{
|
||||
@@ -397,21 +275,21 @@ int main(int argc, char ** argv) {
|
||||
drafts[s].active = false;
|
||||
drafts[s].tokens.clear();
|
||||
drafts[s].i_batch_tgt.clear();
|
||||
drafts[s].dists.clear();
|
||||
}
|
||||
// note: will be erased after the speculation phase
|
||||
drafts[0].tokens.push_back(token_id);
|
||||
drafts[0].dists.push_back(std::vector<llama_token_data>());
|
||||
drafts[0].tokens.push_back(id);
|
||||
drafts[0].i_batch_tgt.push_back(0);
|
||||
|
||||
llama_batch_clear(batch_dft);
|
||||
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
|
||||
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
llama_decode(ctx_dft, batch_dft);
|
||||
llama_decode (ctx_dft, batch_dft);
|
||||
|
||||
++n_past_dft;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_predict > params.n_predict || has_eos) {
|
||||
@@ -456,6 +334,12 @@ int main(int argc, char ** argv) {
|
||||
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
|
||||
}
|
||||
|
||||
if (cur_p[0].p < p_accept) {
|
||||
LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
|
||||
drafts[s].drafting = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<int> sa(1, s);
|
||||
|
||||
// attempt to split the branch if the probability is high enough
|
||||
@@ -483,7 +367,6 @@ int main(int argc, char ** argv) {
|
||||
drafts[n_seq_cur].skip = true;
|
||||
|
||||
drafts[n_seq_cur].tokens = drafts[s].tokens;
|
||||
drafts[n_seq_cur].dists = drafts[s].dists;
|
||||
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
|
||||
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
||||
|
||||
@@ -506,8 +389,6 @@ int main(int argc, char ** argv) {
|
||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
|
||||
|
||||
drafts[s].tokens.push_back(id);
|
||||
// save cur_p.data into drafts[s].dists
|
||||
drafts[s].dists.push_back(cur_p);
|
||||
|
||||
// add unique drafted tokens to the target batch
|
||||
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
||||
@@ -559,7 +440,6 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
drafts[s].tokens.erase(drafts[s].tokens.begin());
|
||||
drafts[s].dists.erase(drafts[s].dists.begin());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
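The stochastic verification above accepts a drafted token with probability min(1, p_tgt / p_dft), where p_tgt and p_dft are the probabilities the target and draft models assign to it; on rejection the draft distribution is subtracted from the target one and the residual is renormalized before resampling. A condensed sketch of just the accept test (the sequence bookkeeping from the example is omitted):

```cpp
#include <random>

// Accept/reject one drafted token following the r <= p_tgt / p_dft rule used above.
static bool accept_draft_token(float p_tgt, float p_dft, std::mt19937 & rng) {
    std::uniform_real_distribution<float> u_dist(0.0f, 1.0f);
    const float r = u_dist(rng);
    return p_dft > 0.0f && r <= p_tgt / p_dft; // accepts with probability min(1, p_tgt / p_dft)
}
```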
@@ -91,14 +91,13 @@ extern "C" {
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);

// create a plan for ggml_cgraph and free it
// compute graph with a plan
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

// compute graph with a plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

@@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
|
||||
backend->iface.graph_plan_free(backend, plan);
|
||||
}
|
||||
|
||||
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
return backend->iface.graph_plan_compute(backend, plan);
|
||||
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
backend->iface.graph_plan_compute(backend, plan);
|
||||
}
|
||||
|
||||
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
return backend->iface.graph_compute(backend, cgraph);
|
||||
}
|
||||
|
||||
@@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
@@ -755,7 +755,8 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
|
||||
cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
ggml_graph_compute(cgraph, &cplan);
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
@@ -1436,7 +1437,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
||||
static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
||||
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
|
||||
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
|
||||
|
||||
@@ -1471,9 +1472,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
|
||||
uint64_t compute_start_us = ggml_time_us();
|
||||
if (!sched->callback_eval) {
|
||||
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph);
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
|
||||
return false;
|
||||
}
|
||||
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
||||
} else {
|
||||
@@ -1494,9 +1494,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
|
||||
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
||||
|
||||
enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv);
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
if (!ggml_backend_graph_compute(split_backend, &gv)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
||||
@@ -1520,7 +1519,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
#endif
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
||||
@@ -1582,7 +1581,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
return true;
|
||||
}
|
||||
|
||||
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||
|
||||
if (!sched->is_reset) {
|
||||
@@ -1591,10 +1590,14 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
|
||||
|
||||
ggml_backend_sched_split_graph(sched, graph);
|
||||
if (!ggml_backend_sched_alloc_splits(sched)) {
|
||||
return GGML_STATUS_ALLOC_FAILED;
|
||||
return false;
|
||||
}
|
||||
|
||||
return ggml_backend_sched_compute_splits(sched);
|
||||
if (!ggml_backend_sched_compute_splits(sched)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
||||
|
||||
@@ -66,13 +66,12 @@ extern "C" {

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);

GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);

GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
@@ -158,26 +157,26 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

// Allocate and compute graph on the backend scheduler
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

//
// Utils

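One side of this comparison has the backend compute entry points return `enum ggml_status` instead of `bool`, so callers can tell allocation failures apart from aborted or failed runs. A minimal caller sketch against the status-returning variant (graph construction and scheduler setup are assumed to have happened elsewhere):

```cpp
#include <cstdio>
#include "ggml.h"
#include "ggml-backend.h"

// Run a previously built graph on the scheduler and report any non-success status.
static bool run_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    const enum ggml_status st = ggml_backend_sched_graph_compute(sched, graph);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed: %s\n", ggml_status_to_string(st));
        return false;
    }
    return true;
}
```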
@@ -12241,7 +12241,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
UNUSED(backend);
}

GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

ggml_cuda_set_main_device(cuda_ctx->device);
@@ -12277,7 +12277,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
GGML_ASSERT(ok);
}

return GGML_STATUS_SUCCESS;
return true;
}

GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {

@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
return ggml_backend_kompute_buffer_type(ctx->device);
}

static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_graph_compute(ctx, cgraph);
return GGML_STATUS_SUCCESS;
return true;
}

static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

@@ -748,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
}
}

static enum ggml_status ggml_metal_graph_compute(
static bool ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {

@@ -2484,7 +2484,7 @@ static enum ggml_status ggml_metal_graph_compute(
MTLCommandBufferStatus status = [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
return GGML_STATUS_FAILED;
return false;
}
}

@@ -2493,7 +2493,7 @@ static enum ggml_status ggml_metal_graph_compute(
}

}
return GGML_STATUS_SUCCESS;
return true;
}

////////////////////////////////////////////////////////////////////////////////
@@ -2795,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
UNUSED(backend);
}

GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;

return ggml_metal_graph_compute(metal_ctx, cgraph);

@@ -2231,7 +2231,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
GGML_UNUSED(backend);
}

static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i];
switch (node->op) {
@@ -2246,7 +2246,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
}
}

return GGML_STATUS_SUCCESS;
return true;

GGML_UNUSED(backend);
}

@@ -51,7 +51,6 @@
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||
@@ -464,8 +463,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
@@ -9564,7 +9563,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
|
||||
const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
||||
const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
|
||||
const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
|
||||
const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
|
||||
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
|
||||
@@ -9586,8 +9585,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
||||
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
||||
|
||||
const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
||||
const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
||||
const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
|
||||
const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
|
||||
|
||||
const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
|
||||
|
||||
@@ -9654,8 +9653,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
|
||||
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
|
||||
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
|
||||
const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
|
||||
const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
|
||||
const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
|
||||
const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
|
||||
|
||||
__m256i signs;
|
||||
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
|
||||
@@ -10552,10 +10551,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
|
||||
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
|
||||
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
|
||||
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
||||
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
||||
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
||||
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
||||
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
||||
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
||||
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
||||
@@ -10662,10 +10661,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
|
||||
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
||||
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
||||
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
||||
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
||||
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
||||
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
||||
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
||||
|
||||
ggml-sycl.cpp (2029 changed lines): file diff suppressed because it is too large.
ggml-vulkan-shaders.hpp (86175 changed lines): file diff suppressed because it is too large.
ggml-vulkan.cpp (2178 changed lines): file diff suppressed because it is too large.
@@ -10,7 +10,6 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16

GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void);

GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);

ggml.c (30 changed lines)
@@ -320,17 +320,6 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
|
||||
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
||||
float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
const char * ggml_status_to_string(enum ggml_status status) {
|
||||
switch (status) {
|
||||
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
||||
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
|
||||
case GGML_STATUS_SUCCESS: return "GGML status: success";
|
||||
case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
|
||||
}
|
||||
|
||||
return "GGML status: unknown";
|
||||
}
|
||||
|
||||
// note: do not use these inside ggml.c
|
||||
// these are meant to be used via the ggml.h API
|
||||
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
||||
@@ -17411,7 +17400,6 @@ struct ggml_compute_state {
|
||||
ggml_thread_t thrd;
|
||||
int ith;
|
||||
struct ggml_compute_state_shared * shared;
|
||||
enum ggml_status ec;
|
||||
};
|
||||
|
||||
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
||||
@@ -17705,8 +17693,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
while (true) {
|
||||
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
state->shared->node_n += 1;
|
||||
state->ec = GGML_STATUS_ABORTED;
|
||||
return 0;
|
||||
return (thread_ret_t) GGML_EXIT_ABORTED;
|
||||
}
|
||||
|
||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||
@@ -17828,7 +17815,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
return GGML_EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
|
||||
@@ -18024,7 +18011,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
return cplan;
|
||||
}
|
||||
|
||||
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
||||
int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
||||
{
|
||||
GGML_ASSERT(cplan);
|
||||
GGML_ASSERT(cplan->n_threads > 0);
|
||||
@@ -18068,7 +18055,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
.thrd = 0,
|
||||
.ith = j,
|
||||
.shared = &state_shared,
|
||||
.ec = GGML_STATUS_SUCCESS,
|
||||
};
|
||||
|
||||
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
||||
@@ -18079,14 +18065,12 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
|
||||
workers[0].ith = 0;
|
||||
workers[0].shared = &state_shared;
|
||||
workers[0].ec = GGML_STATUS_SUCCESS;
|
||||
|
||||
const int64_t perf_start_cycles = ggml_perf_cycles();
|
||||
const int64_t perf_start_time_us = ggml_perf_time_us();
|
||||
|
||||
// this is a work thread too
|
||||
ggml_graph_compute_thread(&workers[0]);
|
||||
enum ggml_status compute_status = workers[0].ec;
|
||||
int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
|
||||
|
||||
// don't leave affinity set on the main thread
|
||||
clear_numa_thread_affinity();
|
||||
@@ -18096,8 +18080,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
for (int j = 1; j < n_threads; j++) {
|
||||
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
||||
GGML_ASSERT(rc == 0);
|
||||
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
||||
compute_status = workers[j].ec;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18125,14 +18107,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
return compute_status;
|
||||
}
|
||||
|
||||
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
|
||||
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
|
||||
|
||||
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
|
||||
|
||||
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
|
||||
|
||||
ggml.h (17 changed lines)
@@ -315,16 +315,6 @@
extern "C" {
#endif

enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
GGML_STATUS_SUCCESS = 0,
GGML_STATUS_ABORTED = 1,
};

// get ggml_status name string
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);

typedef uint16_t ggml_fp16_t;

// convert FP16 <-> FP32
@@ -1950,11 +1940,12 @@ extern "C" {

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

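The same pattern applies at the core ggml level: with the status-returning `ggml_graph_compute_with_ctx`, a caller can surface the failure reason through `ggml_status_to_string` (declared earlier in this diff). A short hedged sketch; `ctx` and `gf` are assumed to have been created via the usual `ggml_init` / graph-building calls:

```cpp
#include <cstdio>
#include "ggml.h"

// Compute a graph using context-allocated work data and report the returned status.
static void compute_and_report(struct ggml_context * ctx, struct ggml_cgraph * gf, int n_threads) {
    const enum ggml_status st = ggml_graph_compute_with_ctx(ctx, gf, n_threads);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "ggml_graph_compute_with_ctx: %s\n", ggml_status_to_string(st));
    }
}
```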
(File diff suppressed because it is too large.)

@@ -15,7 +15,7 @@ array ::=

string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws


@@ -24,7 +24,7 @@ array ::=

string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws

llama.cpp (188 changed lines)
@@ -1983,12 +1983,7 @@ struct llama_context {
        bool logits_all = false;

        // embeddings output (2-dimensional array: [n_tokens][n_embd])
        // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
        std::vector<float> embd;

        // sequence embeddings output (map of [n_embd] vectors)
        // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
        std::map<llama_seq_id, std::vector<float>> embd_seq;
        std::vector<float> embeddings;

        // memory buffers used to evaluate the model
        std::vector<uint8_t> buf_compute_meta;

@@ -2007,7 +2002,6 @@ struct llama_context {
        struct ggml_tensor * inp_KQ_pos;  // F32 [n_ctx]
        struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
        struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
        struct ggml_tensor * inp_cls;     // I32 [n_batch]

#ifdef GGML_USE_MPI
        ggml_mpi_context * ctx_mpi = NULL;

@@ -5014,8 +5008,8 @@ static struct ggml_tensor * llm_build_kqv(
            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
        }

#if defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
#pragma message("    Falling back to ggml_alibi(). Will become an error in Mar 2024")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
        if (hparams.f_max_alibi_bias > 0.0f) {

@@ -6104,7 +6098,6 @@ struct llm_build_context {

        struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos,  n_tokens, 0);
        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls,  n_tokens, 0);

        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);

@@ -6246,29 +6239,24 @@ struct llm_build_context {

        // final output
        cur = inpL;
        cb(cur, "result_embd", -1);

        // pooling layer
        switch (pooling_type) {
            case LLAMA_POOLING_TYPE_NONE:
            case LLAMA_POOLING_TYPE_CLS:
                {
                    // nop
                } break;
            case LLAMA_POOLING_TYPE_MEAN:
                {
                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
                    cb(cur, "result_embd_pooled", -1);
                } break;
            case LLAMA_POOLING_TYPE_CLS:
                {
                    cur = ggml_get_rows(ctx0, cur, inp_cls);
                    cb(cur, "result_embd_pooled", -1);
                } break;
            case LLAMA_POOLING_TYPE_UNSPECIFIED:
                {
                    GGML_ASSERT(false && "Invalid pooling type");
                    GGML_ASSERT(false && "Max pooling not supported");
                } break;
        }
        cb(cur, "result_embd", -1);

        ggml_build_forward_expand(gf, cur);

@@ -8045,7 +8033,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {

                for (int i = 0; i < n_kv; ++i) {
                    float f;
                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
                        (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
                        f = -INFINITY;
                    } else {
                        f = 0.0f;
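The KQ_mask hunk above changes when a KV cell is masked out: one side always masks cells whose position is ahead of the current token, the other applies that position check only when hparams.causal_attn is set, so bidirectional embedding models can attend to the whole sequence. A standalone sketch of the two rules (illustrative only; cell_seq_match and cell_pos stand in for the kv_self.cells fields):

#include <math.h>
#include <stdbool.h>

// sketch: mask value for one KV cell, seen from a query token of a given sequence at position pos
static float kq_mask_value(bool causal_attn, bool cell_seq_match, int cell_pos, int pos) {
    if (!cell_seq_match) {
        return -INFINITY;                 // never attend across sequences
    }
    if (causal_attn && cell_pos > pos) {
        return -INFINITY;                 // causal models must not look ahead
    }
    return 0.0f;                          // cell is visible
}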
@@ -8097,16 +8086,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));

        float * data = (float *) lctx.inp_mean->data;

        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

        std::vector<uint64_t> sum(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];

            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");

            sum[seq_id] += 1;
        }

@@ -8123,26 +8109,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                data[seq_id*n_tokens + i] = div[seq_id];
            }
        }

        if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
            const int64_t n_tokens = batch.n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

            uint32_t * data = (uint32_t *) lctx.inp_cls->data;
            memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

            for (int i = 0; i < n_tokens; ++i) {
                const llama_seq_id seq_id = batch.seq_id[i][0];
                const llama_pos pos = batch.pos[i];

                GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");

                if (pos == 0) {
                    data[seq_id] = i;
                }
            }
        }
    }

static void llama_graph_compute(
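To make the inp_mean setup above concrete (this is an illustration, not code from the repository): for a toy batch holding two sequences, sequence 0 with tokens at batch positions 0 and 1 and sequence 1 with a single token at position 2, row s of the n_tokens x n_tokens matrix averages the tokens of sequence s. A small sketch that fills the matrix the same way:

#include <stdio.h>
#include <string.h>

// sketch: build the mean-pooling matrix for a hypothetical 3-token batch
int main(void) {
    enum { N_TOKENS = 3 };
    const int seq_id[N_TOKENS] = { 0, 0, 1 };      // two sequences: {0, 1} and {2}

    float data[N_TOKENS][N_TOKENS];
    memset(data, 0, sizeof(data));

    int sum[N_TOKENS] = { 0 };
    for (int i = 0; i < N_TOKENS; ++i) {
        sum[seq_id[i]] += 1;                       // tokens per sequence
    }

    for (int i = 0; i < N_TOKENS; ++i) {
        const int s = seq_id[i];
        data[s][i] = 1.0f / (float) sum[s];        // row s averages the tokens of sequence s
    }

    // prints: 0.50 0.50 0.00 / 0.00 0.00 1.00 / 0.00 0.00 0.00
    for (int s = 0; s < N_TOKENS; ++s) {
        for (int i = 0; i < N_TOKENS; ++i) {
            printf("%.2f ", data[s][i]);
        }
        printf("\n");
    }
    return 0;
}

Multiplying the transposed [n_embd, n_tokens] embeddings by such a matrix, as the pooling hunk earlier does with ggml_mul_mat(), yields one averaged [n_embd] vector per sequence.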
@@ -8283,23 +8249,17 @@ static int llama_decode_internal(
        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];

        if (!hparams.causal_attn) {
            res = nullptr; // do not extract logits for embedding models such as BERT

            // token or sequence embeddings
            embd = gf->nodes[gf->n_nodes - 1];

            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
        } else {
            if (strcmp(res->name, "result_output") == 0) {
                // the token embeddings could be the second to last tensor, or the third to last tensor
                if (strcmp(embd->name, "result_norm") != 0) {
                    embd = gf->nodes[gf->n_nodes - 3];
                    GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
                }
            } else {
                GGML_ASSERT(false && "missing result_output tensor");
        if (strcmp(res->name, "result_output") == 0) {
            // the embeddings could be the second to last tensor, or the third to last tensor
            if (strcmp(embd->name, "result_norm") != 0) {
                embd = gf->nodes[gf->n_nodes - 3];
                GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
            }
        } else if (strcmp(res->name, "result_embd") == 0) {
            embd = res;
            res = nullptr;
        } else {
            GGML_ASSERT(false);
        }

        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

@@ -8398,48 +8358,45 @@ static int llama_decode_internal(

        // extract embeddings
        if (cparams.embeddings && embd) {
            auto & embeddings_out = lctx.embeddings;

            ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
            GGML_ASSERT(backend_embd != nullptr);

            switch (cparams.pooling_type) {
                case LLAMA_POOLING_TYPE_NONE:
                    {
                        // extract token embeddings
                        auto & embd_out = lctx.embd;
            if (batch.logits) {
                embeddings_out.resize(n_embd * n_tokens);
                for (uint32_t i = 0; i < n_tokens; i++) {
                    if (batch.logits[i] == 0) {
                        continue;
                    }

                        if (batch.logits) {
                            embd_out.resize(n_embd * n_tokens);
                            for (uint32_t i = 0; i < n_tokens; i++) {
                                if (batch.logits[i] == 0) {
                                    continue;
                    switch (cparams.pooling_type) {
                        case LLAMA_POOLING_TYPE_CLS:
                            {
                                // find the token with the same seq_id and pos == 0 and use its embeddings
                                int i_src = -1;
                                for (int j = 0; j < (int) n_tokens; j++) {
                                    if (batch.seq_id[i][0] == batch.seq_id[j][0] && batch.pos[j] == 0) {
                                        i_src = j;
                                        break;
                                    }
                                }

                                ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
                            }
                        }
                    } break;
                case LLAMA_POOLING_TYPE_CLS:
                case LLAMA_POOLING_TYPE_MEAN:
                    {
                        GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
                                GGML_ASSERT(i_src >= 0);

                        // extract sequence embeddings
                        auto & embd_seq_out = lctx.embd_seq;
                        embd_seq_out.clear();

                        for (uint32_t i = 0; i < n_tokens; i++) {
                            const llama_seq_id seq_id = batch.seq_id[i][0];
                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
                                continue;
                            }
                            embd_seq_out[seq_id].resize(n_embd);
                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                        }
                    } break;
                case LLAMA_POOLING_TYPE_UNSPECIFIED:
                    {
                        GGML_ASSERT(false && "unknown pooling type");
                    } break;
                                ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i_src)*sizeof(float), n_embd*sizeof(float));
                            } break;
                        case LLAMA_POOLING_TYPE_NONE:
                        case LLAMA_POOLING_TYPE_MEAN:
                            {
                                ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
                            } break;
                        default:
                            {
                                GGML_ASSERT(false && "unknown pooling type");
                            } break;
                    }
                }
            }
            ggml_backend_synchronize(backend_embd);
        }
@@ -12321,13 +12278,13 @@ struct llama_context * llama_new_context_with_model(
        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

        if (params.embeddings) {
            ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
            ctx->embeddings.reserve(hparams.n_embd*cparams.n_batch);
        }

        // graph inputs
        {
            ggml_init_params init_params = {
                /* .mem_size   */ ggml_tensor_overhead()*8,
                /* .mem_size   */ ggml_tensor_overhead()*7,
                /* .mem_buffer */ nullptr,
                /* .no_alloc   */ true,
            };

@@ -12340,7 +12297,6 @@ struct llama_context * llama_new_context_with_model(
            ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
            ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
            ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

            ggml_set_name(ctx->inp_tokens, "inp_tokens");
            ggml_set_name(ctx->inp_embd,   "inp_embd");

@@ -12349,7 +12305,6 @@ struct llama_context * llama_new_context_with_model(
            ggml_set_name(ctx->inp_KQ_pos,  "inp_KQ_pos");
            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
            ggml_set_name(ctx->inp_mean,    "inp_mean");
            ggml_set_name(ctx->inp_cls,     "inp_cls");

            ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
            LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,

@@ -12756,7 +12711,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
    // assume worst case for logits although only currently set ones are serialized
    const size_t s_logits         = ctx->logits.capacity() * sizeof(float);
    const size_t s_embedding_size = sizeof(size_t);
    const size_t s_embedding      = ctx->embd.capacity() * sizeof(float);
    const size_t s_embedding      = ctx->embeddings.capacity() * sizeof(float);
    const size_t s_kv_buf_size    = sizeof(size_t);
    const size_t s_kv_head        = sizeof(uint32_t);
    const size_t s_kv_size        = sizeof(uint32_t);

@@ -12865,12 +12820,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat

    // copy embeddings
    {
        const size_t embeddings_size = ctx->embd.size();
        const size_t embeddings_size = ctx->embeddings.size();

        data_ctx->write(&embeddings_size, sizeof(embeddings_size));

        if (embeddings_size) {
            data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
            data_ctx->write(ctx->embeddings.data(), embeddings_size * sizeof(float));
        }
    }

@@ -12978,12 +12933,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {

    memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);

    GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
    GGML_ASSERT(ctx->embeddings.capacity() == embeddings_size);

    if (embeddings_size) {
        ctx->embd.resize(embeddings_size);
        ctx->embeddings.resize(embeddings_size);

        memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
        memcpy(ctx->embeddings.data(), inp, embeddings_size * sizeof(float));
        inp += embeddings_size * sizeof(float);
        }
    }
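The three state hunks above only rename the field that gets serialized; the save/restore flow is the same on both sides. A rough caller-side sketch of that flow (error handling trimmed; the function name save_and_restore is made up):

#include <stdlib.h>
#include "llama.h"

// sketch: snapshot the context state (logits, embeddings, KV cache) and roll back to it later
static void save_and_restore(struct llama_context * ctx) {
    const size_t state_size = llama_get_state_size(ctx);  // worst-case size, as computed above

    uint8_t * buf = malloc(state_size);
    if (buf == NULL) {
        return;
    }

    llama_copy_state_data(ctx, buf);   // serialize into buf
    // ... decode some more, then restore the snapshot:
    llama_set_state_data(ctx, buf);

    free(buf);
}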
@@ -13234,20 +13189,11 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
}

float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embd.data();
    return ctx->embeddings.data();
}

float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
    return ctx->embd.data() + i*ctx->model.hparams.n_embd;
}

float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
    auto it = ctx->embd_seq.find(seq_id);
    if (it == ctx->embd_seq.end()) {
        return nullptr;
    }

    return it->second.data();
    return ctx->embeddings.data() + i*ctx->model.hparams.n_embd;
}

const char * llama_token_get_text(const struct llama_model * model, llama_token token) {

@@ -13421,7 +13367,7 @@ static int32_t llama_chat_apply_template_internal(
        std::string & dest, bool add_ass) {
    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
    std::stringstream ss;
    if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
    if (tmpl.find("<|im_start|>") != std::string::npos) {
        // chatml template
        for (auto message : chat) {
            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";

@@ -13429,7 +13375,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
    } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
    } else if (tmpl.find("[INST]") != std::string::npos) {
        // llama2 template and its variants
        // [variant] support system message
        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;

@@ -13464,7 +13410,7 @@ static int32_t llama_chat_apply_template_internal(
            }
        }
        // llama2 templates seem to not care about "add_generation_prompt"
    } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
    } else if (tmpl.find("<|user|>") != std::string::npos) {
        // zephyr template
        for (auto message : chat) {
            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";

@@ -13472,7 +13418,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
        for (auto message : chat) {
            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message

@@ -13481,7 +13427,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<s>assistant\n";
        }
    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
        // google/gemma-7b-it
        std::string system_prompt = "";
        for (auto message : chat) {

@@ -13528,7 +13474,7 @@ LLAMA_API int32_t llama_chat_apply_template(
    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
    if (res < 0) {
        // worst case: there is no information about template, we will use chatml by default
        curr_tmpl = "chatml";        // see llama_chat_apply_template_internal
        curr_tmpl = "<|im_start|>";  // see llama_chat_apply_template_internal
    } else {
        curr_tmpl = std::string(model_template.data(), model_template.size());
    }
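The template hunks above only change how a template is recognized: one side accepts exact names such as "chatml" in addition to substring markers such as "<|im_start|>", the other relies on the markers alone (hence its fallback string "<|im_start|>"). The public entry point is the same on both sides; a rough usage sketch, assuming the llama_chat_apply_template() signature of this period:

#include <stdio.h>
#include "llama.h"

// sketch: format a short conversation; the literal name "chatml" is only recognized on the side
// of the diff that checks exact names, the other side needs a string containing "<|im_start|>"
static void format_chatml_example(void) {
    const struct llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };

    char buf[1024];
    // with an explicit template string the model's chat_template metadata is not consulted
    const int32_t n = llama_chat_apply_template(NULL, "chatml", chat, 2, /*add_ass =*/ true, buf, sizeof(buf));
    if (n > 0 && n < (int32_t) sizeof(buf)) {
        printf("%.*s\n", n, buf);
    }
}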
10 llama.h
@@ -655,20 +655,14 @@ extern "C" {
    // llama_get_logits(ctx) + i*n_vocab
    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

    // Get all output token embeddings
    // shape: [n_tokens*n_embd] (1-dimensional)
    // Get the embeddings for the input
    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Get the embeddings for the ith token
    // llama_get_embeddings(ctx) + i*n_embd
    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

    // Get the embeddings for a sequence id
    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

    //
    // Vocab
    //
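To make the llama.h difference concrete: on the side that declares llama_get_embeddings_seq(), a context created with embeddings enabled and a pooling type returns one pooled [n_embd] vector per sequence, while per-token vectors still come from llama_get_embeddings_ith(). A rough sketch of reading the results back after llama_decode() (the function name print_embeddings is made up; assumes the batch has already been decoded):

#include <stdio.h>
#include "llama.h"

// sketch: read embeddings back after decoding, preferring the pooled per-sequence vector
static void print_embeddings(struct llama_context * ctx, int n_tokens, llama_seq_id seq_id) {
    // pooled path: NULL when pooling_type is LLAMA_POOLING_TYPE_NONE
    const float * seq_emb = llama_get_embeddings_seq(ctx, seq_id);
    if (seq_emb != NULL) {
        printf("seq %d: first component = %f\n", seq_id, seq_emb[0]);
        return;
    }

    // unpooled path: one [n_embd] vector per token
    for (int i = 0; i < n_tokens; ++i) {
        const float * tok_emb = llama_get_embeddings_ith(ctx, i);
        printf("token %d: first component = %f\n", i, tok_emb[0]);
    }
}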
@@ -18,7 +18,7 @@ except ImportError as e:

KEY_PROPERTIES = [
    "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
    "type_k", "type_v", "no_kv_offload", "tensor_split", "n_prompt", "n_gen"
    "type_k", "type_v", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
]

# Properties that are boolean and are converted to Yes/No for the table:
@@ -1 +1 @@
8695910a39102609073d0e099aa7c97d6bcb3bf9
274680868e12427373bab4bec87554431b954704