jinja : correct default size for string slices (#19913)

model : add Jina Embeddings v5 Nano (partial EuroBERT) support (#19826)
* WIP: Add EuroBERT support with autoformatting changes

  This commit includes:
  - EuroBERT model implementation for GGUF conversion
  - C++ backend support for EuroBERT architecture
  - Unintended autoformatting changes to Python files

  Saving before reverting formatting-only changes.
* feat: add back EOS assert when not using last-token pooling
* feat: removed duplicated code and cleanup
* feat: removed non-working architectures and unnecessary check
* fix: typo
* fix: dynamic pooling config
* feat: added an example model for eurobert
* feat: proper llama-vocab implementation for jina-v5
* fix: removed unnecessary comments
2026-03-05 14:33:24 +02:00 · 2026-02-26 12:28:09 +01:00 · 2026-02-26 12:14:09 +01:00 · 2026-02-26 12:46:32 +02:00 · 2026-02-26 11:26:16 +01:00 · 2026-02-26 10:27:20 +08:00
25 changed files with 420 additions and 600 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1578,7 +1578,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_sparam());
    add_opt(common_arg(
-        {"--temp"}, "N",
+        {"--temp", "--temperature"}, "N",
        string_format("temperature (default: %.2f)", (double)params.sampling.temp),
        [](common_params & params, const std::string & value) {
            params.sampling.temp = std::stof(value);
@@ -1611,7 +1611,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_sparam());
    add_opt(common_arg(
-        {"--top-nsigma"}, "N",
+        {"--top-nsigma", "--top-n-sigma"}, "N",
        string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
        [](common_params & params, const std::string & value) {
            params.sampling.top_n_sigma = std::stof(value);
@@ -1634,7 +1634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_sparam());
    add_opt(common_arg(
-        {"--typical"}, "N",
+        {"--typical", "--typical-p"}, "N",
        string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
        [](common_params & params, const std::string & value) {
            params.sampling.typ_p = std::stof(value);
@@ -2642,8 +2642,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA,
-                    LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
--- a/common/common.h
+++ b/common/common.h
@@ -104,7 +104,6 @@ enum llama_example {
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,
-    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,

    LLAMA_EXAMPLE_COUNT,
 };
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) {
        int64_t arr_size = 0;
        if (is_val<value_array>(object)) {
            arr_size = object->as_array().size();
+        } else if (is_val<value_string>(object)) {
+            arr_size = object->as_string().length();
        }

        if (is_stmt<slice_expression>(this->property)) {
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1148,6 +1148,9 @@ class TextModel(ModelBase):
        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
            res = "jina-v2-de"
+        if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano
+            res = "jina-v5-nano"
        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
            res = "smaug-bpe"
@@ -6125,6 +6128,32 @@ class NeoBert(BertModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model")
+class EuroBertModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.EUROBERT
+
+    def set_vocab(self):
+        self.gguf_writer.add_add_bos_token(False)
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # EuroBert is bidirectional (encoder)
+        self.gguf_writer.add_causal_attention(False)
+
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+        self._try_set_pooling_type()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Strip "model." prefix from tensor names
+        if name.startswith("model."):
+            name = name[6:]
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -107,6 +107,7 @@ models = [
    {"name": "jina-v2-en",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
    {"name": "jina-v2-es",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "jina-v5-nano",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", },
    {"name": "smaug-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
--- a/ggml/src/ggml-sycl/binbcast.cpp
+++ b/ggml/src/ggml-sycl/binbcast.cpp
@@ -11,8 +11,8 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13,
+        int s00, int s01, int s02, int s03,
+        int s10, int s11, int s12, int s13,
        const sycl::nd_item<3> &item_ct1) {
    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                    item_ct1.get_local_id(2);
@@ -44,7 +44,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
    for (int i0 = i0s; i0 < ne0;
         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
    }
 }

@@ -53,8 +53,8 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13,
+        int s00, int s01, int s02, int s03,
+        int s10, int s11, int s12, int s13,
        const sycl::nd_item<3> &item_ct1) {

    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -82,7 +82,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
    dst_t * dst_row = dst + i_dst;

    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
 }


@@ -95,7 +95,8 @@ struct bin_bcast_sycl {
                    const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
                    const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
                    const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
-                    const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
+                    const bool src1_is_contiguous, const bool src0_is_permuted, const bool src1_is_permuted,
+                    queue_ptr stream) {
        int nr0 = ne10 / ne0;
        int nr1 = ne11/ne1;
        int nr2 = ne12/ne2;
@@ -123,7 +124,7 @@ struct bin_bcast_sycl {
            cnb[3] *= cne[3];
        };

-        if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
+        if (src0_is_contiguous && src1_is_contiguous && !src0_is_permuted && !src1_is_permuted) {
            for (int i = 0; i < 4; i++) {
                if (nr[i] != 1) {
                    break;
@@ -164,7 +165,7 @@ struct bin_bcast_sycl {
            size_t nb12 = cnb1[2];
            size_t nb13 = cnb1[3];

-            size_t s0 = nb0 / sizeof(dst_t);
+            // size_t s0 = nb0 / sizeof(dst_t);
            size_t s1 = nb1 / sizeof(dst_t);
            size_t s2 = nb2 / sizeof(dst_t);
            size_t s3 = nb3 / sizeof(dst_t);
@@ -196,9 +197,6 @@ struct bin_bcast_sycl {
            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
            const int block_size = 128;

            int64_t hne0 = std::max(ne0/2LL, 1LL);
@@ -232,8 +230,8 @@ struct bin_bcast_sycl {
                        [=](sycl::nd_item<3> item_ct1) {
                            k_bin_bcast_unravel<bin_op>(
                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
-                                s03, s11, s12, s13, item_ct1);
+                                ne10, ne11, ne12, ne13, s1, s2, s3, s00, s01, s02,
+                                s03, s10, s11, s12, s13, item_ct1);
                        });
                }
            } else {
@@ -251,7 +249,7 @@ struct bin_bcast_sycl {
                    [=](sycl::nd_item<3> item_ct1) {
                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s01, s02, s03, s11, s12, s13,
+                                            s1, s2, s3, s00, s01, s02, s03, s10, s11, s12, s13,
                                            item_ct1);
                    });
            }
@@ -268,24 +266,27 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
             ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
-             ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+             ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
        op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
             ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
-             nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
+             nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
             main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
        op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
             ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
-             nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+             nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
+             main_stream);
    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
        op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
             ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
-             nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+             nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
+             main_stream);
    } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
        op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
             ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
-             nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+             nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
+             main_stream);
    } else {
        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
                ggml_type_name(src0->type), ggml_type_name(src1->type));
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13820,12 +13820,11 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
    return true;
 }

-// Check whether the tensors overlap in memory but are not equal.
-// Fusions can potenitally overwrite src tensors in ways that are not prevented
-// by ggml-alloc. If the fusion is entirely elementwise, then it's OK for them
-// to overlap if they are exactly equal.
-// XXX TODO this check is probably missing from several fusion optimizations.
-static bool ggml_vk_tensors_overlap_but_not_equal(const ggml_tensor * a, const ggml_tensor * b) {
+// Check whether the tensors overlap in memory.
+// Fusions can potentially overwrite src tensors in ways that are not prevented
+// by ggml-alloc. If the fusion src is being applied in a way that's elementwise
+// with the destination, then it's OK for them to overlap if they are exactly equal.
+static bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise) {
    ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)a->buffer->context;
    vk_buffer a_buf = a_buf_ctx->dev_buffer;
    ggml_backend_vk_buffer_context * b_buf_ctx = (ggml_backend_vk_buffer_context *)b->buffer->context;
@@ -13836,7 +13835,7 @@ static bool ggml_vk_tensors_overlap_but_not_equal(const ggml_tensor * a, const g
        auto b_base = vk_tensor_offset(b) + b->view_offs;
        auto b_size = ggml_nbytes(b);

-        if (a_base == b_base && a_size == b_size) {
+        if (elementwise && a_base == b_base && a_size == b_size) {
            return false;
        }

@@ -13874,13 +13873,6 @@ static bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, co
        return false;
    }

-    // must not overwrite srcs in a way that's not elementwise
-    ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0];
-    if (ggml_vk_tensors_overlap_but_not_equal(rms->src[0], rope) ||
-        ggml_vk_tensors_overlap_but_not_equal(other_src, rope)) {
-        return false;
-    }
-
    // conditions for pipeline creation
    if (!(ctx->device->float_controls_rte_fp16 &&
        sizeof(vk_op_rms_norm_mul_rope_push_constants) <= ctx->device->properties.limits.maxPushConstantsSize)) {
@@ -13942,6 +13934,18 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
    return num_adds;
 }

+static int32_t find_first_set(uint32_t x) {
+    int32_t ret = 0;
+    if (!x) {
+        return -1;
+    }
+    while (!(x & 1)) {
+        x >>= 1;
+        ret++;
+    }
+    return ret;
+}
+
 static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@@ -14040,6 +14044,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
            total_mul_mat_bytes += bytes;
        }

+        // op_srcs_fused_elementwise[k] indicates whether the srcs of the k-th op in the
+        // fusion all contribute to the fused result in an elementwise way. This affects
+        // whether the memory for a src is allowed to overlap the memory for the destination.
+        // The array is sized to handle the largest fusion (asserted later).
+        bool op_srcs_fused_elementwise[12];
+
        ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
        ctx->fused_topk_moe_scale = false;
        const char *fusion_string {};
@@ -14048,39 +14058,68 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
            if (num_adds) {
                ctx->num_additional_fused_ops = num_adds - 1;
                fusion_string = "MULTI_ADD";
+                std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, true);
            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
                ctx->num_additional_fused_ops = 2;
                fusion_string = "MUL_MAT_ADD_ADD";
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = true;
+                op_srcs_fused_elementwise[2] = true;
            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
                ctx->num_additional_fused_ops = 1;
                fusion_string = "MUL_MAT_ADD";
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = true;
            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
                ctx->num_additional_fused_ops = 2;
                fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = true;
+                op_srcs_fused_elementwise[2] = true;
            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
                ctx->num_additional_fused_ops = 1;
                fusion_string = "MUL_MAT_ID_ADD_ID";
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = true;
            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
                ctx->num_additional_fused_ops = 1;
                fusion_string = "MUL_MAT_ID_MUL";
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = true;
            } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
                       ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
                       ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
                       ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
                ctx->num_additional_fused_ops = 4;
                fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = false;
+                op_srcs_fused_elementwise[2] = false;
+                op_srcs_fused_elementwise[3] = false;
+                op_srcs_fused_elementwise[4] = false;
            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
                       ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
                ctx->num_additional_fused_ops = 2;
                fusion_string = "RMS_NORM_MUL_ROPE";
+                // rope is approximately elementwise - whole rows are done by a single workgroup and it's row-wise
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = true;
+                op_srcs_fused_elementwise[2] = true;
            } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
                ctx->num_additional_fused_ops = 1;
                fusion_string = "RMS_NORM_MUL";
+                // rms_norm is not elementwise, but there is one workgroup per row, and a whole row must be
+                // consumed and its scale factor computed before the row is overwritten. So close enough.
+                op_srcs_fused_elementwise[0] = true;
+                op_srcs_fused_elementwise[1] = true;
            } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
                       ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
                       ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
                ctx->num_additional_fused_ops = 2;
                fusion_string = "ROPE_VIEW_SET_ROWS";
+                op_srcs_fused_elementwise[0] = false;
+                op_srcs_fused_elementwise[1] = false;
+                op_srcs_fused_elementwise[2] = false;
            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
                       ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
@@ -14089,6 +14128,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                ctx->fused_ops_write_mask |= 1 << 3;
                ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM;
                fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
+                std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) &&
                       ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) &&
                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) {
@@ -14097,6 +14137,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                ctx->fused_ops_write_mask |= 1 << 4;
                ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS;
                fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS";
+                std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
                       ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
@@ -14105,6 +14146,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                ctx->fused_ops_write_mask |= 1 << 3;
                ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX;
                fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
+                std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
            } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
                       ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
                       ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
@@ -14113,6 +14155,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                ctx->fused_ops_write_mask |= 1 << 1;
                ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX;
                fusion_string = "TOPK_MOE_LATE_SOFTMAX";
+                std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
            }
            if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
                // Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano.
@@ -14120,11 +14163,73 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                    ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) {
                    ctx->fused_topk_moe_scale = true;
                    ctx->num_additional_fused_ops++;
+                    op_srcs_fused_elementwise[ctx->num_additional_fused_ops] = false;
                }
            }
        }
+        GGML_ASSERT(ctx->num_additional_fused_ops < (int)(sizeof(op_srcs_fused_elementwise) / sizeof(op_srcs_fused_elementwise[0])));
        ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;

+        // Check whether fusion would overwrite src operands while they're still in use.
+        // If so, disable fusion.
+        if (ctx->num_additional_fused_ops) {
+            // There are up to two output nodes - topk_moe has two.
+            uint32_t bits = ctx->fused_ops_write_mask & ~(1 << ctx->num_additional_fused_ops);
+            ggml_tensor *output_nodes[2] {};
+            output_nodes[0] = cgraph->nodes[i + ctx->num_additional_fused_ops];
+            if (bits) {
+                int output_idx = find_first_set(bits);
+                GGML_ASSERT(bits == (1u << output_idx));
+                output_nodes[1] = cgraph->nodes[i + output_idx];
+            }
+
+            bool need_disable = false;
+
+            // topk_moe often overwrites the source, but for a given row all the src values are
+            // loaded before anything is stored. If there's only one row, this is safe, so treat
+            // this as a special case.
+            bool is_topk_moe_single_row = ctx->fused_topk_moe_mode != TOPK_MOE_COUNT &&
+                                          ggml_nrows(cgraph->nodes[i]->src[0]) == 1;
+
+            if (!is_topk_moe_single_row) {
+                for (int j = 0; j < 2; ++j) {
+                    ggml_tensor *dst = output_nodes[j];
+                    if (!dst) {
+                        continue;
+                    }
+                    // Loop over all srcs of all nodes in the fusion. If the src overlaps
+                    // the destination and the src is not an intermediate node that's being
+                    // elided, then disable fusion.
+                    for (int k = 0; k <= ctx->num_additional_fused_ops; ++k) {
+                        for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
+                            ggml_tensor *src = cgraph->nodes[i + k]->src[s];
+                            if (!src || src->op == GGML_OP_NONE) {
+                                continue;
+                            }
+                            if (ggml_vk_tensors_overlap(src, dst, op_srcs_fused_elementwise[k])) {
+                                bool found = false;
+                                for (int n = 0; n < k; ++n) {
+                                    if (cgraph->nodes[i + n] == src) {
+                                        found = true;
+                                        break;
+                                    }
+                                }
+                                if (!found) {
+                                    need_disable = true;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            if (need_disable) {
+                ctx->num_additional_fused_ops = 0;
+                ctx->fused_ops_write_mask = 1;
+                ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
+                ctx->fused_topk_moe_scale = false;
+            }
+        }
+
        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
        bool submit = (submitted_nodes >= nodes_per_submit) ||
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -228,13 +228,41 @@ struct gguf_context {
 };

 struct gguf_reader {
-    FILE * file;
+    gguf_reader(FILE * file) : file(file) {
+        // compute the number of remaining bytes once here; each read then decrements it
+        nbytes_remain = file_remain(file);
+    }

-    gguf_reader(FILE * file) : file(file) {}
+    // helper for remaining bytes in a file
+    static uint64_t file_remain(FILE * file) {
+        const int64_t cur = gguf_ftell(file);
+        if (cur < 0) {
+            return 0;
+        }
+        if (gguf_fseek(file, 0, SEEK_END) != 0) {
+            gguf_fseek(file, cur, SEEK_SET);
+
+            return 0;
+        }
+        const int64_t end = gguf_ftell(file);
+        if (end < 0) {
+            gguf_fseek(file, cur, SEEK_SET);
+
+            return 0;
+        }
+        gguf_fseek(file, cur, SEEK_SET);
+        return static_cast<uint64_t>(end - cur);
+    }

    template <typename T>
    bool read(T & dst) const {
-        return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+        const size_t size = sizeof(dst);
+        if (nbytes_remain < size) {
+            return false;
+        }
+        const size_t nread = fread(&dst, 1, size, file);
+        nbytes_remain -= nread;
+        return nread == size;
    }

    template <typename T>
@@ -242,20 +270,19 @@ struct gguf_reader {
        if (n > GGUF_MAX_ARRAY_ELEMENTS) {
            return false;
        }
-        const uint64_t nbytes = nbytes_remain();
        if constexpr (std::is_same<T, std::string>::value) {
            // strings are prefixed with their length, so we need to account for that
            if (n > SIZE_MAX / sizeof(uint64_t)) {
                return false;
            }
-            if (nbytes < n * sizeof(uint64_t)) {
+            if (nbytes_remain < n * sizeof(uint64_t)) {
                return false;
            }
        } else {
            if (n > SIZE_MAX / sizeof(T)) {
                return false;
            }
-            if (nbytes < n * sizeof(T)) {
+            if (nbytes_remain < n * sizeof(T)) {
                return false;
            }
        }
@@ -312,39 +339,29 @@ struct gguf_reader {
            GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds maximum %" PRIu64 "\n", __func__, size, (uint64_t) GGUF_MAX_STRING_LENGTH);
            return false;
        }
-        const uint64_t nbytes = nbytes_remain();
-        if (size > nbytes) {
-            GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes);
+        if (size > nbytes_remain) {
+            GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes_remain);
            return false;
        }
        dst.resize(static_cast<size_t>(size));
-        return fread(dst.data(), 1, dst.length(), file) == dst.length();
+        const size_t nread = fread(dst.data(), 1, size, file);
+        nbytes_remain -= nread;
+        return nread == size;
    }

    bool read(void * dst, const size_t size) const {
-        return fread(dst, 1, size, file) == size;
+        if (size > nbytes_remain) {
+            return false;
+        }
+        const size_t nread = fread(dst, 1, size, file);
+        nbytes_remain -= nread;
+        return nread == size;
    }

-    // remaining bytes in the file
-    uint64_t nbytes_remain() const {
-        const int64_t cur = gguf_ftell(file);
-        if (cur < 0) {
-            return 0;
-        }
-        if (gguf_fseek(file, 0, SEEK_END) != 0) {
-            gguf_fseek(file, cur, SEEK_SET);
+private:
+    FILE * file;

-            return 0;
-        }
-        const int64_t end = gguf_ftell(file);
-        if (end < 0) {
-            gguf_fseek(file, cur, SEEK_SET);
-
-            return 0;
-        }
-        gguf_fseek(file, cur, SEEK_SET);
-        return static_cast<uint64_t>(end - cur);
-    }
+    mutable uint64_t nbytes_remain;
 };

 struct gguf_context * gguf_init_empty(void) {
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -379,6 +379,7 @@ class MODEL_ARCH(IntEnum):
    NEO_BERT         = auto()
    JINA_BERT_V2     = auto()
    JINA_BERT_V3     = auto()
+    EUROBERT         = auto()
    BLOOM            = auto()
    STABLELM         = auto()
    QWEN             = auto()
@@ -820,6 +821,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.NEO_BERT:         "neo-bert",
    MODEL_ARCH.JINA_BERT_V2:     "jina-bert-v2",
    MODEL_ARCH.JINA_BERT_V3:     "jina-bert-v3",
+    MODEL_ARCH.EUROBERT:         "eurobert",
    MODEL_ARCH.BLOOM:            "bloom",
    MODEL_ARCH.STABLELM:         "stablelm",
    MODEL_ARCH.QWEN:             "qwen",
@@ -1587,6 +1589,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.LAYER_OUT_NORM,
    ],
+    MODEL_ARCH.EUROBERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_DOWN,
+    ],
    MODEL_ARCH.MPT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
--- a/include/llama.h
+++ b/include/llama.h
@@ -617,13 +617,6 @@ extern "C" {
            const char * fname_out,
            const llama_model_quantize_params * params);

-    // Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
-    LLAMA_API struct ggml_cgraph * llama_graph_reserve(
-            struct llama_context * ctx,
-            uint32_t n_tokens,
-            uint32_t n_seqs,
-            uint32_t n_outputs);
-
    //
    // Adapters
    //
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -62,6 +62,7 @@ add_library(llama
            models/dream.cpp
            models/ernie4-5-moe.cpp
            models/ernie4-5.cpp
+            models/eurobert.cpp
            models/exaone-moe.cpp
            models/exaone.cpp
            models/exaone4.cpp
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -26,6 +26,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_NEO_BERT,         "neo-bert"         },
    { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
    { LLM_ARCH_JINA_BERT_V3,     "jina-bert-v3"     },
+    { LLM_ARCH_EUROBERT,         "eurobert"         },
    { LLM_ARCH_BLOOM,            "bloom"            },
    { LLM_ARCH_STABLELM,         "stablelm"         },
    { LLM_ARCH_QWEN,             "qwen"             },
@@ -819,6 +820,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_CLS,
                LLM_TENSOR_CLS_OUT,
            };
+        case LLM_ARCH_EUROBERT:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_DOWN,
+            };
        case LLM_ARCH_MODERN_BERT:
            return {
                LLM_TENSOR_TOKEN_EMBD,
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -30,6 +30,7 @@ enum llm_arch {
    LLM_ARCH_NEO_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_JINA_BERT_V3,
+    LLM_ARCH_EUROBERT,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -3035,19 +3035,6 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
    return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
 }

-struct ggml_cgraph * llama_graph_reserve(
-        struct llama_context * ctx,
-        uint32_t n_tokens,
-        uint32_t n_seqs,
-        uint32_t n_outputs) {
-    auto * memory = ctx->get_memory();
-    llama_memory_context_ptr mctx;
-    if (memory) {
-        mctx = memory->init_full();
-    }
-    return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
-}
-
 // llama adapter API

 int32_t llama_set_adapters_lora(
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -979,6 +979,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    type = LLM_TYPE_250M;
                }
            } break;
+        case LLM_ARCH_EUROBERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,            hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
+
+                if (hparams.n_layer == 12) {
+                    type = LLM_TYPE_SMALL;  // 0.2B
+                }
+            } break;
        case LLM_ARCH_BLOOM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3570,6 +3580,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    }
                } break;
+            case LLM_ARCH_EUROBERT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    }
+                } break;
            case LLM_ARCH_JINA_BERT_V2:
                {
                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -8181,6 +8214,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_EUROBERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_GEMMA_EMBEDDING:
@@ -8378,6 +8412,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
            } break;
+        case LLM_ARCH_EUROBERT:
+            {
+                llm = std::make_unique<llm_build_eurobert>(*this, params);
+            } break;
        case LLM_ARCH_BLOOM:
            {
                llm = std::make_unique<llm_build_bloom>(*this, params);
@@ -9004,6 +9042,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_EUROBERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1890,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "falcon-h1" ||
                    tokenizer_pre == "pixtral"  ||
                    tokenizer_pre == "midm-2.0" ||
-                    tokenizer_pre == "lfm2") {
+                    tokenizer_pre == "lfm2"     ||
+                    tokenizer_pre == "jina-v5-nano") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                ignore_merges = true;
                add_bos = true;
--- a/src/models/eurobert.cpp
+++ b/src/models/eurobert.cpp
@@ -0,0 +1,97 @@
+#include "models.h"
+
+// Builds the EuroBERT graph: per layer, RMS-norm -> RoPE self-attention (cache-less input) -> residual,
+// then RMS-norm -> gated SiLU FFN -> residual; the final RMS-normed state is stored in res->t_embd.
+llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    inpL = build_inp_embd(model.tok_embd);
+    cb(inpL, "inp_embd", -1);
+
+    auto * inp_attn = build_attn_inp_no_cache();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        // assign to the function-level `cur`; a loop-local redeclaration would shadow it
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // rotary position embedding is applied to Q and K only
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, nullptr,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            cb(cur, "kqv_out", il);
+        }
+
+        // on the last layer, keep only the rows selected by inp_out_ids (when present)
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+
+        cur = ggml_add(ctx0, cur, inpL);
+
+        ggml_tensor * ffn_inp = cur;
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_embd", -1);
+    res->t_embd = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -424,6 +424,10 @@ struct llm_build_neo_bert : public llm_graph_context {
    llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
 };

+struct llm_build_eurobert : public llm_graph_context {
+    llm_build_eurobert(const llama_model & model, const llm_graph_params & params);
+};
+
 template <bool iswa>
 struct llm_build_olmo2 : public llm_graph_context {
    llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -31,12 +31,10 @@
 #include <cstring>
 #include <ctime>
 #include <future>
-#include <fstream>
 #include <memory>
 #include <random>
 #include <regex>
 #include <set>
-#include <sstream>
 #include <string>
 #include <string_view>
 #include <thread>
@@ -6596,236 +6594,6 @@ struct test_diag : public test_case {
    }
 };

-// Deserializable generic test case
-struct input_tensor {
-    ggml_type type;
-    std::array<int64_t, 4> ne;
-    std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
-};
-
-static bool is_non_contiguous(const input_tensor & src) {
-    if (src.nb[0] == 0) {
-        return false;
-    }
-    const size_t default_nb0 = ggml_type_size(src.type);
-    const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
-    const size_t default_nb2 = default_nb1 * src.ne[1];
-    const size_t default_nb3 = default_nb2 * src.ne[2];
-    return src.nb[0] != default_nb0 ||
-           src.nb[1] != default_nb1 ||
-           src.nb[2] != default_nb2 ||
-           src.nb[3] != default_nb3;
-}
-
-static std::string var_to_str(const std::vector<input_tensor>& sources) {
-    std::ostringstream oss;
-    bool first = true;
-    for (const auto& src : sources) {
-        if (!first) oss << ",";
-        oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
-        if (is_non_contiguous(src)) {
-            oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
-        }
-        first = false;
-    }
-    return oss.str();
-}
-
-static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
-    std::ostringstream oss;
-    oss << "[";
-    bool first = true;
-    for (size_t i = 0; i < params.size(); ++i) {
-        if (params[i] != 0) {
-            if (!first) oss << ",";
-            oss << i << ":" << params[i];
-            first = false;
-        }
-    }
-    oss << "]";
-    return oss.str();
-}
-
-
-struct test_generic_op : public test_case {
-    const ggml_op op;
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;
-
-    const std::vector<input_tensor> sources;
-    const std::string name;
-
-    std::string vars() override {
-        if (name.empty()) {
-            return VARS_TO_STR4(type, ne, op_params, sources);
-        }
-
-        return VARS_TO_STR5(name, type, ne, op_params, sources);
-    }
-
-    test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
-                    std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
-                    std::vector<input_tensor> sources, std::string name = "")
-        : op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);
-
-        std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
-        for (size_t i = 0; i < source_count; ++i) {
-            const input_tensor& src = sources[i];
-
-            if (is_non_contiguous(src)) {
-                size_t total_size;
-                const size_t blck_size = ggml_blck_size(src.type);
-                if (blck_size == 1) {
-                    total_size = ggml_type_size(src.type);
-                    for (int d = 0; d < 4; d++) {
-                        total_size += (src.ne[d] - 1) * src.nb[d];
-                    }
-                } else {
-                    total_size = src.ne[0] * src.nb[0] / blck_size;
-                    for (int d = 1; d < 4; d++) {
-                        total_size += (src.ne[d] - 1) * src.nb[d];
-                    }
-                }
-
-                // Convert bytes to elements, padded to block size for quantized types
-                const size_t type_size = ggml_type_size(src.type);
-                size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
-                backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;
-                ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
-                source_tensors[i] = ggml_view_4d(ctx, backing,
-                    src.ne[0], src.ne[1], src.ne[2], src.ne[3],
-                    src.nb[1], src.nb[2], src.nb[3], 0);
-                // nb[0] does not get set by view_4d, so set it manually
-                source_tensors[i]->nb[0] = src.nb[0];
-            } else {
-                source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
-            }
-        }
-
-        // Ops with an inplace flag create a view of src[0] as their output.
-        bool inplace = false;
-        if (op == GGML_OP_SET || op == GGML_OP_ACC) {
-            inplace = op_params[4] != 0;
-        } else if (op == GGML_OP_ADD_REL_POS) {
-            inplace = op_params[0] != 0;
-        }
-
-        ggml_tensor * out;
-        if (inplace && source_count > 0) {
-            out = ggml_view_tensor(ctx, source_tensors[0]);
-        } else {
-            out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
-        }
-        out->op = op;
-        for (size_t i = 0; i < source_count; ++i) {
-            out->src[i] = source_tensors[i];
-        }
-
-        memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-
-    double max_nmse_err() override {
-        switch (op) {
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-        case GGML_OP_OUT_PROD:
-        case GGML_OP_CONV_TRANSPOSE_2D:
-        case GGML_OP_IM2COL:
-        case GGML_OP_CONV_2D:
-        case GGML_OP_CONV_3D:
-        case GGML_OP_SET_ROWS:
-        case GGML_OP_CPY:
-            return 5e-4;
-        case GGML_OP_SOFT_MAX:
-            return 1e-6;
-        case GGML_OP_RWKV_WKV7:
-            return 5e-3;
-        case GGML_OP_FLASH_ATTN_EXT:
-        {
-            // Scale error with kv length to account for accumulating floating point error
-            const int64_t kv = sources[1].ne[1];
-            return 5e-4 * std::max(1.0, kv / 20000.0);
-        }
-        default:
-            return 1e-7;
-        }
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        ggml_tensor * out = ggml_get_tensor(ctx, "out");
-
-        std::random_device rd;
-        std::default_random_engine rng(rd());
-
-        for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
-            ggml_tensor * t = out->src[i];
-            if (!t) {
-                break;
-            }
-
-            // FLASH_ATTN_EXT: src[3] is the KQ mask
-            if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
-                init_tensor_kq_mask(t);
-                continue;
-            }
-
-            if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
-                if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
-                    const int64_t num_rows = sources[0].ne[1];
-                    const int64_t nels = ggml_nelements(t);
-                    std::vector<int32_t> data(nels);
-                    std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
-                    for (int64_t i = 0; i < nels; i++) {
-                        data[i] = dist(rng);
-                    }
-                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
-                } else if (op == GGML_OP_SET_ROWS) {
-                    init_set_rows_row_ids(t, ne[1]);
-                } else if (op == GGML_OP_ROPE) {
-                    const int mode = op_params[2];
-                    const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
-                    std::vector<int32_t> data(nels);
-                    std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
-                    for (int64_t i = 0; i < nels; i++) {
-                        data[i] = dist(rng);
-                    }
-                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
-                } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
-                    const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
-                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
-                        std::vector<int32_t> data(t->ne[0]);
-                        for (int32_t i = 0; i < t->ne[0]; i++) {
-                            data[i] = i % n_expert;
-                        }
-                        std::shuffle(data.begin(), data.end(), rng);
-                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
-                    }
-                } else if (op == GGML_OP_SSM_SCAN) {
-                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
-                        std::vector<int32_t> data(t->ne[0]);
-                        for (int32_t i = 0; i < t->ne[0]; i++) {
-                            data[i] = i;
-                        }
-                        std::shuffle(data.begin(), data.end(), rng);
-                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
-                    }
-                } else {
-                    init_tensor_uniform(t);
-                }
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
-

 enum llm_norm_type {
    LLM_NORM,
@@ -8885,72 +8653,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    return test_cases;
 }

-static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
-    std::ifstream f(path);
-
-    if (!f.is_open()) {
-        throw std::runtime_error("Unable to read test file");
-    }
-
-    std::vector<std::unique_ptr<test_case>> test_cases;
-
-    std::string line;
-
-    while (std::getline(f, line)) {
-        std::istringstream iss(line);
-
-        ggml_op op;
-        ggml_type type;
-        std::array<int64_t, 4> ne;
-        std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
-        std::string name;
-        uint64_t tmp;
-
-        iss >> tmp;
-        op = (ggml_op)tmp;
-        iss >> tmp;
-        type = (ggml_type)tmp;
-
-        for (size_t i = 0; i < 4; i++) {
-            iss >> ne[i];
-        }
-
-        iss >> tmp;
-        for (size_t i = 0; i < tmp && i < op_params.size(); i++) {
-            iss >> op_params[i];
-        }
-
-        iss >> tmp;
-
-        size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp);
-        std::vector<input_tensor> sources(num_src);
-        for (size_t i = 0; i < num_src; i++) {
-            input_tensor& src = sources[i];
-            iss >> tmp;
-            src.type = (ggml_type)tmp;
-
-            for (size_t i = 0; i < 4; i++) {
-                iss >> src.ne[i];
-            }
-            for (size_t i = 0; i < 4; i++) {
-                iss >> src.nb[i];
-            }
-        }
-
-        iss >> name;
-
-        if (name.length() == 1 && name[0] == '-') {
-            name = "";
-        }
-
-        test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
-    }
-
-    return test_cases;
-}
-
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
-                         printer * output_printer, const char * test_file_path) {
+                         printer * output_printer) {
    auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
        if (params_filter == nullptr) {
            return;
@@ -8968,26 +8672,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        }
    };

-    std::vector<std::unique_ptr<test_case>> test_cases;
-
-    if (test_file_path == nullptr) {
-        switch (mode) {
-        case MODE_TEST:
-        case MODE_GRAD:
-        case MODE_SUPPORT:
-            test_cases = make_test_cases_eval();
-            break;
-        case MODE_PERF:
-            test_cases = make_test_cases_perf();
-            break;
-        }
-    } else {
-        test_cases = make_test_cases_from_file(test_file_path);
-    }
-
-    filter_test_cases(test_cases, params_filter);
-
    if (mode == MODE_TEST) {
+        auto test_cases = make_test_cases_eval();
+        filter_test_cases(test_cases, params_filter);
        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
        if (backend_cpu == NULL) {
            test_operation_info info("", "", "CPU");
@@ -9027,6 +8714,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_GRAD) {
+        auto test_cases = make_test_cases_eval();
+        filter_test_cases(test_cases, params_filter);
        size_t n_ok = 0;
        for (auto & test : test_cases) {
            if (test->eval_grad(backend, op_names_filter, output_printer)) {
@@ -9039,6 +8728,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_PERF) {
+        auto test_cases = make_test_cases_perf();
+        filter_test_cases(test_cases, params_filter);
        for (auto & test : test_cases) {
            test->eval_perf(backend, op_names_filter, output_printer);
        }
@@ -9046,6 +8737,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_SUPPORT) {
+        auto test_cases = make_test_cases_eval();
+        filter_test_cases(test_cases, params_filter);
+
        // Filter out fusion cases
        test_cases.erase(
            std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
@@ -9164,8 +8858,7 @@ static void show_test_coverage() {
 }

 static void usage(char ** argv) {
-    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
-    printf(" [--show-coverage] [--test-file <path>]\n");
+    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
    printf("    valid modes:\n");
    printf("      - test (default, compare with CPU backend for correctness)\n");
    printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -9176,7 +8869,6 @@ static void usage(char ** argv) {
    printf("    --output specifies output format (default: console, options: console, sql, csv)\n");
    printf("    --list-ops lists all available GGML operations\n");
    printf("    --show-coverage shows test coverage\n");
-    printf("    --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
 }

 int main(int argc, char ** argv) {
@@ -9185,7 +8877,6 @@ int main(int argc, char ** argv) {
    const char * op_names_filter = nullptr;
    const char * backend_filter = nullptr;
    const char * params_filter = nullptr;
-    const char * test_file_path = nullptr;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "test") == 0) {
@@ -9233,13 +8924,6 @@ int main(int argc, char ** argv) {
        } else if (strcmp(argv[i], "--show-coverage") == 0) {
            show_test_coverage();
            return 0;
-        } else if (strcmp(argv[i], "--test-file") == 0) {
-            if (i + 1 < argc) {
-                test_file_path = argv[++i];
-            } else {
-                usage(argv);
-                return 1;
-            }
        } else {
            usage(argv);
            return 1;
@@ -9292,7 +8976,7 @@ int main(int argc, char ** argv) {
                                                             false, "", ggml_backend_dev_description(dev),
                                                             total / 1024 / 1024, free / 1024 / 1024, true));

-        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
+        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());

        if (ok) {
            n_ok++;
--- a/tests/test-tokenizer-0.sh
+++ b/tests/test-tokenizer-0.sh
@@ -13,7 +13,12 @@ fi
 name=$1
 input=$2

-make -j tests/test-tokenizer-0
+# Build using CMake if binary doesn't exist
+if [ ! -f ./build/bin/test-tokenizer-0 ]; then
+    printf "Building test-tokenizer-0 with CMake...\n"
+    cmake -B build -DLLAMA_BUILD_TESTS=ON
+    cmake --build build --target test-tokenizer-0 -j
+fi

 printf "Testing %s on %s ...\n" $name $input

@@ -23,7 +28,7 @@ printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
 python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1

 printf "Tokenizing using (cpp) llama.cpp ...\n"
-./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
+./build/bin/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1

 cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
 cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -37,5 +37,4 @@ else()
        add_subdirectory(export-lora)
    endif()
    add_subdirectory(fit-params)
-    add_subdirectory(export-graph-ops)
 endif()
--- a/tools/export-graph-ops/CMakeLists.txt
+++ b/tools/export-graph-ops/CMakeLists.txt
@@ -1,8 +0,0 @@
-set(TARGET llama-export-graph-ops)
-add_executable(${TARGET} export-graph-ops.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
-endif()
--- a/tools/export-graph-ops/export-graph-ops.cpp
+++ b/tools/export-graph-ops/export-graph-ops.cpp
@@ -1,168 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "log.h"
-#include "llama.h"
-#include "ggml.h"
-
-#include <array>
-#include <vector>
-#include <set>
-#include <fstream>
-#include <iostream>
-
-struct input_tensor {
-    ggml_type type;
-    std::array<int64_t, 4> ne;
-    std::array<size_t, 4> nb;
-
-    input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
-        memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
-        memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
-    }
-
-    bool operator<(const input_tensor &b) const {
-        return std::tie(type, ne, nb) <
-               std::tie(b.type, b.ne, b.nb);
-    }
-
-    void serialize(std::ostream& out) const {
-        out << type << ' ';
-        for (size_t i = 0; i < 4; i++) {
-            out << ne[i] << ' ';
-        }
-        for (size_t i = 0; i < 4; i++) {
-            out << nb[i] << ' ';
-        }
-    }
-};
-
-struct test_object {
-    ggml_op op;
-    ggml_type type;
-    std::array<int64_t, 4> ne;
-    std::vector<int32_t> op_params;
-    std::vector<input_tensor> sources;
-    std::string name;
-
-    void serialize(std::ostream& out) const {
-        out << op << ' ' << type << ' ';
-        for (size_t i = 0; i < 4; i++) {
-            out << ne[i] << ' ';
-        }
-
-        out << op_params.size() << ' ';
-        for (size_t i = 0; i < op_params.size(); i++) {
-            out << op_params[i] << ' ';
-        }
-
-        out << sources.size() << ' ';
-        for (size_t s = 0; s < sources.size(); s++) {
-            sources[s].serialize(out);
-        }
-
-        if (!name.empty()) {
-            out << name;
-        } else {
-            out << '-';
-        }
-
-        out << '\n';
-    }
-
-    bool operator<(const test_object &b) const {
-        return std::tie(op, type, ne, op_params, sources) <
-               std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
-    }
-};
-
-static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
-    int n_nodes = ggml_graph_n_nodes(cgraph);
-    int n_skipped = 0;
-    int n_before = (int) tests.size();
-    for (int i = 0; i < n_nodes; i++) {
-        ggml_tensor * node = ggml_graph_node(cgraph, i);
-
-        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
-            n_skipped++;
-            continue;
-        }
-
-        test_object test;
-
-        test.op = node->op;
-        test.type = node->type;
-        memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));
-
-        test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
-        memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);
-
-        for (size_t s = 0; s < GGML_MAX_SRC; s++) {
-            if (node->src[s] == nullptr) {
-                break;
-            }
-
-            test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
-        }
-
-        test.name = node->name;
-        tests.insert(test);
-    }
-
-    int n_new = (int) tests.size() - n_before;
-    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
-            label, n_new, n_nodes, n_skipped);
-}
-
-int main(int argc, char ** argv) {
-    common_params params;
-    params.out_file = "tests.txt";
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
-        return 1;
-    }
-
-    common_init();
-
-    // Load CPU-only
-    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    params.devices = { cpu_device, nullptr };
-    params.fit_params = false;
-    params.n_gpu_layers = 0;
-
-    params.warmup = false;
-
-    auto init_result = common_init_from_params(params);
-
-    llama_context * ctx = init_result->context();
-
-    const uint32_t n_seqs  = llama_n_seq_max(ctx);
-    const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
-
-    std::set<test_object> tests;
-
-    auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
-    if (!gf_pp) {
-        throw std::runtime_error("failed to reserve prompt processing graph");
-    }
-    extract_graph_ops(gf_pp, "pp", tests);
-
-    auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
-    if (!gf_tg) {
-        throw std::runtime_error("failed to reserve token generation graph");
-    }
-    extract_graph_ops(gf_tg, "tg", tests);
-
-    LOG_INF("%d unique ops total\n", (int) tests.size());
-
-    std::ofstream f(params.out_file);
-
-    if (!f.is_open()) {
-        throw std::runtime_error("Unable to open output file");
-    }
-
-    for (const auto& test : tests) {
-        test.serialize(f);
-    }
-
-    return 0;
-}
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -912,7 +912,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c

    const bool add_bos = llama_vocab_get_add_bos(vocab);

-    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
+    if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_LAST) {
+        GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
+    }

    auto tim1 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenizing the input ..\n", __func__);
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1510,7 +1510,7 @@ version = 1
 ; If the same key is defined in a specific preset, it will override the value in this global section.
 [*]
 c = 8192
-n-gpu-layer = 8
+n-gpu-layers = 8

 ; If the key corresponds to an existing model on the server,
 ; this will be used as the default config for that model
Author	SHA1	Message	Date
Eric Zhang	9b62913b40	jinja : correct default size for string slices (#19913)	2026-02-26 12:28:09 +01:00
Maximilian Werk	66287bdaac	model : add Jina Embeddings v5 Nano (partial EuroBERT) support (#19826) * WIP: Add EuroBERT support with autoformatting changes This commit includes: - EuroBERT model implementation for GGUF conversion - C++ backend support for EuroBERT architecture - Unintended autoformatting changes to Python files Saving before reverting formatting-only changes. * feat: add back eos assert when not last token pooling * feat: removed duplicated code and cleanup * feat: removed not working architectures and unnecessary check * fix: typo * fix: dynamic pooling config * feat: added an example model for eurobert * feat: proper llama-vocab implementation for jina-v5 * fix: removed unnecessary comments	2026-02-26 12:14:09 +01:00
Georgi Gerganov	1ca3d1de15	gguf : avoid too many file size calls (#19919)	2026-02-26 12:46:32 +02:00
yggdrasil75	bd72300591	server : fix typo in server README.md (#19900) fix typo	2026-02-26 11:26:16 +01:00
Neo Zhang	2943210c1e	support permuted, remove check s0/s10 (#19889) Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>	2026-02-26 10:27:20 +08:00
Jeff Bolz	3769fe6eb7	vulkan: check for memory overlap before doing fusion (#19768) * vulkan: check for memory overlap before doing fusion * Update ggml/src/ggml-vulkan/ggml-vulkan.cpp * address feedback	2026-02-25 18:25:38 +01:00
ddh0	832aa94762	common : add more aliases for sampler CLI params (#19797) * common : add more aliases for sampler CLI params	2026-02-25 16:34:25 +01:00