Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-04-30 16:47:31 +03:00

Compare commits: b8317...0cc4m/vulk (18 commits)
| SHA1 |
|---|
| 0776a6a039 |
| 937a425600 |
| 5c177a1036 |
| ccd8d4a6ce |
| 4374b5ab9a |
| eebf21c3e9 |
| 08a4ba6f03 |
| 2204bcedc8 |
| c0d100e0fc |
| 58deae173e |
| 73c9eb8ced |
| 983df142a9 |
| 57819b8d4b |
| 557fe2d913 |
| 0e810413bb |
| 128142fe7d |
| 6de1bc631d |
| 0a10c34dc1 |
@@ -732,23 +732,28 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-completion",
"llama-convert-llama2c-to-ggml",
"llama-cvector-generator",
"llama-debug",
"llama-diffusion-cli",
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-finetune",
"llama-fit-params",
"llama-gemma3-cli",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
"llama-gguf-split",
"llama-gritlm",
"llama-idle",
"llama-imatrix",
"llama-infill",
"llama-mtmd-cli",
"llama-llava-clip-quantize-cli",
"llama-llava-cli",
"llama-lookahead",
"llama-lookup",
"llama-lookup-create",
"llama-lookup-merge",
"llama-lookup-stats",
"llama-minicpmv-cli",
"llama-mtmd-cli",
"llama-parallel",
"llama-passkey",
"llama-perplexity",

@@ -2666,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -105,6 +105,7 @@ enum llama_example {
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_RESULTS,
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,

LLAMA_EXAMPLE_COUNT,
};
@@ -2194,6 +2194,8 @@ class GPTNeoXModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_head is not None
assert n_embed is not None

if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
# Map bloom-style qkv_linear to gpt-style qkv_linear

@@ -2231,6 +2233,8 @@ class BloomModel(TextModel):
def set_gguf_parameters(self):
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
assert n_head is not None
assert n_embed is not None
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
self.gguf_writer.add_feed_forward_length(4 * n_embed)

@@ -2243,6 +2247,8 @@ class BloomModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_head is not None
assert n_embed is not None

name = re.sub(r'transformer\.', '', name)

@@ -3853,6 +3859,7 @@ class LLaDAModel(TextModel):

if (rope_dim := hparams.get("head_dim")) is None:
n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
assert n_heads is not None
rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -3884,6 +3891,7 @@ class LLaDAModel(TextModel):

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
assert n_head is not None
n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))

if self.undo_permute:

@@ -9485,7 +9493,9 @@ class ChatGLMModel(TextModel):

def set_gguf_parameters(self):
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_embed is not None
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
assert n_head is not None
n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
@@ -1450,13 +1450,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<int32_t> ids;
std::vector<ggml_bitset_t> used_ids;

static bool vk_sched_sync_log = getenv("GGML_VK_SYNC_LOG") != nullptr;

for (int split_id = 0; split_id < sched->n_splits; split_id++) {
struct ggml_backend_sched_split * split = &splits[split_id];
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];

if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] split %d/%d backend_id=%d name=%s n_inputs=%d n_nodes=%d\n",
split_id, sched->n_splits, split_backend_id,
ggml_backend_name(split_backend), split->n_inputs, split->graph.n_nodes);
}

// copy the input tensors to the split backend

@@ -1468,13 +1472,29 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: event_synchronize (INPUT flag)\n", input->name);
}
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: backend_synchronize (INPUT flag, no event)\n", input->name);
}
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: event_wait\n", input->name);
}
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: backend_synchronize (no event)\n", input->name);
}
ggml_backend_synchronize(split_backend);
}

// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used

@@ -1565,7 +1585,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
bool async_ok = split_backend->iface.cpy_tensor_async && split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy);
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: cpy_tensor_async=%s\n", input->name, async_ok ? "true" : "false");
}
if (!async_ok) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] input %s: fallback sync copy\n", input->name);
}
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);

@@ -1578,11 +1605,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}

if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}

if (!sched->callback_eval) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] graph_compute_async on %s (%d nodes)\n",
ggml_backend_name(split_backend), split->graph.n_nodes);
}
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
return ec;

@@ -1624,6 +1651,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// record the event of this copy
if (split->n_inputs > 0) {
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
if (vk_sched_sync_log) {
fprintf(stderr, "[VK_SYNC sched] event_record on %s\n", ggml_backend_name(split_backend));
}
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
}
}
@@ -2823,14 +2823,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

// enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;

if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}

if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}

@@ -2841,17 +2838,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;

if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}

if (copy_from_host) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
} else if (backend_src != backend_dst) {
if (backend_src != backend_dst) {
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -1156,7 +1156,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_RWKV_WKV7:
return true;
case GGML_OP_GATED_DELTA_NET:
return op->src[2]->ne[0] % 32 == 0;
return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
case GGML_OP_SOLVE_TRI:
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
@@ -3006,7 +3006,7 @@ kernel void kernel_l2_norm_impl(
sumf = shmem_f32[tiisg];
sumf = simd_sum(sumf);

const float scale = 1.0f/sqrt(max(sumf, args.eps));
const float scale = 1.0f/max(sqrt(sumf), args.eps);

for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
y[i00] = x[i00] * scale;
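The one-line change above moves the epsilon clamp outside the square root. Written out side by side (a reconstruction for comparison, not part of the diff):

old: scale = 1 / sqrt(max(Σx², ε))
new: scale = 1 / max(sqrt(Σx²), ε)

In the new form, ε acts as a floor on the L2 norm itself rather than on the squared sum, so ε is interpreted in the same units as the norm; this appears to match how the eps parameter is applied in ggml's other l2_norm backends.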
@@ -115,6 +115,17 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
#define VK_LOG_DEBUG(msg) ((void) 0)
#endif // GGML_VULKAN_DEBUG

// Synchronization tracing for multi-GPU deadlock debugging.
// Enable with GGML_VK_SYNC_LOG=1 environment variable.
static bool vk_sync_log_enabled = false;
#define VK_SYNC_LOG(dev_name, msg) do { \
    if (vk_sync_log_enabled) { \
        fprintf(stderr, "[VK_SYNC %s tid=%lu] %s\n", \
            (dev_name).c_str(), (unsigned long)std::hash<std::thread::id>{}(std::this_thread::get_id()) % 10000, \
            ((std::ostringstream&)(std::ostringstream() << msg)).str().c_str()); \
    } \
} while (0)

struct ggml_backend_vk_context;

#define MAX_PARAMETER_COUNT 12
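A note on the macro body above: `std::ostringstream() << msg` yields a plain `std::ostream&`, and the cast back to `std::ostringstream&` is the usual trick that makes `.str()` reachable on the temporary, so call sites can stream arbitrary expressions. A call then looks like the ones appearing throughout this diff, e.g.:

VK_SYNC_LOG(ctx->name, "set_tensor_async: tensor=" << tensor->name << " size=" << size);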
@@ -191,6 +202,7 @@ struct vk_queue;

struct vk_command_buffer {
vk::CommandBuffer buf;
uint64_t use_counter = 0;
bool in_use = false;
};

@@ -254,7 +266,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
class vk_memory_logger;
class vk_perf_logger;
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx, const char * caller = "unknown");

static constexpr uint32_t mul_mat_vec_max_cols = 8;
static constexpr uint32_t p021_max_gqa_ratio = 8;

@@ -938,21 +950,26 @@ struct vk_subbuffer {
}
};

// vk_event is used for the event-related backend interfaces. It uses 'event' for
// event_wait and 'fence' for event_synchronize. Polling on an event for
// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
// and would lead to validation errors.
struct vk_event {
vk::Event event;
vk::Fence fence;
vk_command_buffer* cmd_buffer = nullptr;
};

struct vk_semaphore {
vk::Semaphore s;
uint64_t value;
};

// vk_event is used for the event-related backend interfaces. It uses vk::Events for
// event_wait and a timeline semaphore for event_synchronize. Polling on an event for
// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
// and would lead to validation errors.
struct vk_event {
std::vector<vk::Event> events_free;      // Events available for reuse
std::vector<vk::Event> events_submitted; // Events that are fully submitted and can be reused on next synchronize
vk::Event event;
bool has_event;

vk_semaphore tl_semaphore;
vk_command_buffer* cmd_buffer = nullptr;
uint64_t cmd_buffer_use_counter = 0;
};

struct vk_submission {
vk_command_buffer* buffer = nullptr;
std::vector<vk_semaphore> wait_semaphores;
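The rewrite above trades the single event/fence pair for an event pool plus a timeline semaphore. A fence has to be reset between submissions, which forced event_record and event_synchronize into strict alternation; a timeline semaphore only counts upward, so each event_record can bump tl_semaphore.value and event_synchronize waits for the latest value. A minimal sketch of the new wait path, mirroring the event_synchronize hunk later in this diff:

// wait until the device has signaled the most recently recorded value
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, vkev->tl_semaphore.s, vkev->tl_semaphore.value};
VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize");

Retired vk::Events collect in events_submitted and are reset and recycled into events_free on the next synchronize.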
@@ -2319,7 +2336,7 @@ static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_comman
vk::CommandBufferLevel::ePrimary,
1);
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
p.cmd_buffers.push_back({ cmd_buffers.front(), true });
p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true });
return &p.cmd_buffers[p.cmd_buffers.size()-1];
}
@@ -2788,6 +2805,15 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
);
}

static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) {
VK_LOG_DEBUG("ggml_vk_reset_event()");

ctx->s->buffer->buf.resetEvent(
event,
ctx->p->q->stage_flags
);
}

static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
VK_LOG_DEBUG("ggml_vk_set_event()");
@@ -5731,6 +5757,7 @@ static void ggml_vk_instance_init() {
vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
vk_memory_logger_enabled = getenv("GGML_VK_MEMORY_LOGGER") != nullptr;
vk_sync_log_enabled = getenv("GGML_VK_SYNC_LOG") != nullptr;
const char* GGML_VK_PIPELINE_STATS = getenv("GGML_VK_PIPELINE_STATS");
if (GGML_VK_PIPELINE_STATS != nullptr) {
vk_pipeline_stats_filter = GGML_VK_PIPELINE_STATS;

@@ -6392,6 +6419,7 @@ static vk_subbuffer ggml_vk_tensor_subbuffer(
static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
for (auto& cmd_buffer : pool.cmd_buffers) {
if (!cmd_buffer.in_use) {
cmd_buffer.use_counter++;
cmd_buffer.in_use = true;
return &cmd_buffer;
}

@@ -6495,11 +6523,13 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
}

static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx, const char * caller = "unknown") {
if (!ctx->compute_ctx.expired()) {
return ctx->compute_ctx.lock();
}

VK_SYNC_LOG(ctx->name, "get_compute_ctx: CREATING NEW compute_ctx, caller=" << caller);

vk_context result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);

ctx->compute_ctx = result;

@@ -12652,7 +12682,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
ctx->submit_pending = true;
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "preallocate_buffers");
GGML_ASSERT(ctx->compute_ctx.expired());
ggml_vk_ctx_begin(ctx->device, subctx);
ctx->compute_ctx = subctx;
@@ -12729,7 +12759,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
}
}

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "build_graph");

{
// This logic detects dependencies between nodes in the graph and calls ggml_vk_sync_buffers
@@ -13228,7 +13258,7 @@ static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
ctx->submit_pending = true;

#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "check_results");
ggml_vk_check_results_1(ctx, cgraph, tensor_idx);
#endif
}

@@ -13287,7 +13317,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
// discard any unsubmitted command buffers
ctx->compute_ctx.reset();
// wait for any pending command buffers to finish
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "cleanup");

ggml_vk_graph_cleanup(ctx);

@@ -13606,6 +13636,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b
static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
VK_SYNC_LOG(ctx->name, "set_tensor_async: tensor=" << tensor->name << " size=" << size);
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

if (size == 0) {

@@ -13626,7 +13657,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
cpy_ctx = ctx->transfer_ctx.lock();
}
} else {
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
cpy_ctx = ggml_vk_get_compute_ctx(ctx, "set_tensor_async");
}

vk_buffer buf = buf_ctx->dev_buffer;

@@ -13646,13 +13677,14 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor

cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "set_tensor_async_staging");
}
}

static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
VK_SYNC_LOG(ctx->name, "get_tensor_async: tensor=" << tensor->name << " size=" << size);
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

if (size == 0) {

@@ -13661,7 +13693,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_

ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "get_tensor_async");

vk_buffer buf = buf_ctx->dev_buffer;

@@ -13680,13 +13712,17 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_

compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
ggml_vk_synchronize(ctx);
ggml_vk_synchronize(ctx, "get_tensor_async_staging");
}
}

static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: BEGIN " << src->name << " -> " << dst->name
<< " size=" << ggml_nbytes(src)
<< " src_is_vk=" << ggml_backend_buffer_is_vk(src->buffer)
<< " src_is_host=" << ggml_backend_buffer_is_host(src->buffer));

// Skip zero-size tensors
if (ggml_nbytes(src) == 0) {

@@ -13708,7 +13744,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
return false;
}

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "cpy_tensor_async_vk2vk");

ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs,
src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs,

@@ -13721,6 +13757,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
size_t pinned_offset = 0;
ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset);
if (pinned_buf == nullptr) {
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: host src not pinned, returning false");
return false;
}
@@ -13730,27 +13767,34 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba
cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = cpy_ctx;
ggml_vk_ctx_begin(ctx->device, cpy_ctx);
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: created new transfer_ctx");
} else {
cpy_ctx = ctx->transfer_ctx.lock();
}
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: using transfer queue for host->dev copy");
} else {
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
cpy_ctx = ggml_vk_get_compute_ctx(ctx, "cpy_tensor_async_host2dev");
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: using compute queue for host->dev copy");
}

return ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
bool ret = ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
vk_tensor_offset(dst) + dst->view_offs,
src->data, ggml_nbytes(src));
VK_SYNC_LOG(ctx->name, "cpy_tensor_async: host->dev write_async returned " << ret);
return ret;
}

GGML_UNUSED(backend_src);
return false;
}

static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx, const char * caller) {
VK_LOG_DEBUG("ggml_vk_synchronize()");

bool do_transfer = !ctx->compute_ctx.expired();

VK_SYNC_LOG(ctx->name, "vk_synchronize: do_transfer=" << do_transfer << " submit_pending=" << ctx->submit_pending << " caller=" << caller);

if (ggml_vk_submit_transfer_ctx(ctx)) {
ctx->submit_pending = true;
}
@@ -13774,6 +13818,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
}

if (ctx->submit_pending) {
VK_SYNC_LOG(ctx->name, "vk_synchronize: waiting for fence...");
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
vk::TimelineSemaphoreSubmitInfo tl_info{
1, &ctx->transfer_semaphore.value,

@@ -13797,6 +13842,7 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
ctx->submit_pending = false;
if (cmd_buf) {
cmd_buf->in_use = false;
cmd_buf->buf.reset();
}
}

@@ -13812,9 +13858,16 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

ggml_vk_synchronize(ctx);
VK_SYNC_LOG(ctx->name, "synchronize: BEGIN submit_pending=" << ctx->submit_pending
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());

ggml_vk_synchronize(ctx, "backend_synchronize");

VK_SYNC_LOG(ctx->name, "synchronize: fence done, calling graph_cleanup");

ggml_vk_graph_cleanup(ctx);

VK_SYNC_LOG(ctx->name, "synchronize: DONE");
}

static bool ggml_vk_is_empty(ggml_tensor * node) {

@@ -14233,6 +14286,10 @@ static int32_t find_first_set(uint32_t x) {
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
VK_SYNC_LOG(ctx->name, "graph_compute: BEGIN nodes=" << cgraph->n_nodes
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired()
<< " transfer_ctx_alive=" << !ctx->transfer_ctx.expired()
<< " submit_pending=" << ctx->submit_pending);

if (vk_instance.debug_utils_support) {
vk::DebugUtilsLabelEXT dul = {};

@@ -14285,7 +14342,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);

GGML_ASSERT(ctx->compute_ctx.expired());
compute_ctx = ggml_vk_get_compute_ctx(ctx);
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_perf");
ctx->query_idx = 0;
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
}

@@ -14295,7 +14352,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

if (ctx->prealloc_size_add_rms_partials) {
ggml_vk_preallocate_buffers(ctx, nullptr);
compute_ctx = ggml_vk_get_compute_ctx(ctx);
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_rms_partials");
// initialize partial sums to zero.
ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials);
ggml_vk_sync_buffers(ctx, compute_ctx);

@@ -14518,7 +14575,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);

if (vk_perf_logger_enabled && enqueued) {
compute_ctx = ggml_vk_get_compute_ctx(ctx);
compute_ctx = ggml_vk_get_compute_ctx(ctx, "graph_compute_perf_timestamp");
if (!vk_perf_logger_concurrent) {
// track a single node/fusion for the current query
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];

@@ -14601,9 +14658,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}

if (!ctx->device->support_async) {
ggml_vk_synchronize(ctx);
VK_SYNC_LOG(ctx->name, "graph_compute: sync (support_async=false)");
ggml_vk_synchronize(ctx, "graph_compute_no_async");
}

VK_SYNC_LOG(ctx->name, "graph_compute: DONE submit_pending=" << ctx->submit_pending
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());

return GGML_STATUS_SUCCESS;

UNUSED(backend);
@@ -14853,24 +14914,45 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;

VK_SYNC_LOG(ctx->name, "event_record: BEGIN event=" << event
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired()
<< " submit_pending=" << ctx->submit_pending);

ggml_vk_submit_transfer_ctx(ctx);

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "event_record");
auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset

// the backend interface doesn't have an explicit reset, so reset it here
// before we record the command to set it
ctx->device->device.resetEvent(vkev->event);
ctx->device->device.resetFences({ vkev->fence });
if (vkev->has_event) {
// Move existing event into submitted
vkev->events_submitted.push_back(vkev->event);
}

// Grab the next event and record it, create one if necessary
if (vkev->events_free.empty()) {
VK_SYNC_LOG(ctx->name, "event_record: create new event");
vkev->event = ctx->device->device.createEvent({});
} else {
vkev->event = vkev->events_free.back();
vkev->events_free.pop_back();
}

vkev->has_event = true;

ggml_vk_set_event(compute_ctx, vkev->event);

vkev->tl_semaphore.value++;
compute_ctx->s->signal_semaphores.push_back(vkev->tl_semaphore);
ggml_vk_ctx_end(compute_ctx);

ggml_vk_submit(compute_ctx, {vkev->fence});
VK_SYNC_LOG(ctx->name, "event_record: set event, submit cmd_buf=" << cmd_buf->buf << ", vk_event=" << (VkEvent)vkev->event);
ggml_vk_submit(compute_ctx, {});
ctx->submit_pending = true;
vkev->cmd_buffer = cmd_buf;
vkev->cmd_buffer_use_counter = cmd_buf->use_counter;
ctx->compute_ctx.reset();

VK_SYNC_LOG(ctx->name, "event_record: DONE event=" << event);
}

static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {

@@ -14878,11 +14960,16 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;

vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx, "event_wait");

ggml_vk_wait_events(compute_ctx, {vkev->event});
ggml_vk_ctx_end(compute_ctx);
ctx->compute_ctx.reset();
if (vkev->has_event) {
// Wait for latest event
VK_SYNC_LOG(ctx->name, "event_wait: recording vkCmdWaitEvents, event=" << event
<< " vk_event=" << (VkEvent)vkev->event
<< " compute_ctx_alive=" << !ctx->compute_ctx.expired());
ggml_vk_wait_events(compute_ctx, { vkev->event });
VK_SYNC_LOG(ctx->name, "event_wait: DONE (cmd recorded, not yet submitted)");
}
}

// TODO: enable async and synchronize

@@ -15672,10 +15759,13 @@ static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t
return nullptr;
}

// The event/fence is expected to initially be in the signaled state.
vkev->event = device->device.createEvent({});
vkev->fence = device->device.createFence({vk::FenceCreateFlagBits::eSignaled});
device->device.setEvent(vkev->event);
// No events initially, they get created on demand
vkev->has_event = false;

vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
vk::SemaphoreCreateInfo ci{};
ci.setPNext(&tci);
vkev->tl_semaphore = { device->device.createSemaphore(ci), 0 };

return new ggml_backend_event {
/* .device = */ dev,

@@ -15689,8 +15779,16 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe

vk_event *vkev = (vk_event *)event->context;

device->device.destroyFence(vkev->fence);
device->device.destroyEvent(vkev->event);
device->device.destroySemaphore(vkev->tl_semaphore.s);
for (auto& event : vkev->events_free) {
device->device.destroyEvent(event);
}
for (auto& event : vkev->events_submitted) {
device->device.destroyEvent(event);
}
if (vkev->has_event) {
device->device.destroyEvent(vkev->event);
}
delete vkev;
delete event;
}

@@ -15701,10 +15799,33 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
auto device = ggml_vk_get_device(ctx->device);
vk_event *vkev = (vk_event *)event->context;

VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
// Finished using current command buffer so we flag for reuse
if (vkev->cmd_buffer) {
vkev->cmd_buffer->in_use = false;
// Only do something if the event has actually been used
if (vkev->has_event) {
VK_SYNC_LOG(device->name, "event_synchronize: BEGIN waiting on timeline semaphore, event=" << event);
vk::Semaphore sem = vkev->tl_semaphore.s;
uint64_t val = vkev->tl_semaphore.value;
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, sem, val};
VK_CHECK(device->device.waitSemaphores(swi, UINT64_MAX), "event_synchronize");

VK_SYNC_LOG(device->name, "event_synchronize: timeline semaphore wait done, event=" << event);

// Reset and move submitted events
for (auto& event : vkev->events_submitted) {
VK_SYNC_LOG(device->name, "event_synchronize: resetting vkevent=" << event);
device->device.resetEvent(event);
}
vkev->events_free.insert(vkev->events_free.end(), vkev->events_submitted.begin(), vkev->events_submitted.end());
vkev->events_submitted.clear();

// Finished using current command buffer so we flag for reuse
if (vkev->cmd_buffer) {
// Only flag for reuse if it hasn't been reused already
if (vkev->cmd_buffer_use_counter == vkev->cmd_buffer->use_counter) {
vkev->cmd_buffer->in_use = false;
vkev->cmd_buffer->buf.reset();
}
vkev->cmd_buffer = nullptr;
}
}
}
@@ -5,7 +5,7 @@ import os
import sys
import subprocess

HTTPLIB_VERSION = "refs/tags/v0.37.0"
HTTPLIB_VERSION = "refs/tags/v0.37.1"

vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

@@ -7,6 +7,7 @@
#include "llama-memory.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include "llama-ext.h"

#include <cinttypes>
#include <cmath>

@@ -341,6 +342,14 @@ llama_context::llama_context(

if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);

if (!graph_reuse_disable) {
// TODO: figure out a way to make graph reuse work with pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/20463
LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);

graph_reuse_disable = true;
}
}

sched_reserve();

@@ -3129,6 +3138,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}

struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs) {
auto * memory = ctx->get_memory();
llama_memory_context_ptr mctx;
if (memory) {
mctx = memory->init_full();
}
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
}

// llama adapter API

int32_t llama_set_adapters_lora(
src/llama-ext.h (new file, 12 lines)
@@ -0,0 +1,12 @@
#pragma once

#include "llama-context.h"
#include "ggml.h"
#include "stdint.h"

// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs);
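A minimal sketch of how this internal extension API is driven, following the call pattern of tests/export-graph-ops.cpp later in this diff (the sizing choices are that test's, not requirements of the API):

// reserve a worst-case prompt-processing graph, inspect it, then a token-generation graph
const uint32_t n_seqs   = llama_n_seq_max(ctx);
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
ggml_cgraph * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
// ... walk gf_pp here: the next call invalidates it ...
ggml_cgraph * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);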
@@ -1160,13 +1160,13 @@ struct llama_grammar * llama_grammar_init_impl(
// if there is a grammar, parse it
// rules will be empty (default) if there are parse errors
if (!parser.parse(grammar_str) || parser.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
LLAMA_LOG_ERROR("failed to parse grammar\n");
return nullptr;
}

// Ensure that there is a "root" node.
if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
// Ensure that the grammar contains the start symbol
if (parser.symbol_ids.find(grammar_root) == parser.symbol_ids.end()) {
LLAMA_LOG_ERROR("grammar does not contain a '%s' symbol\n", grammar_root);
return nullptr;
}

@@ -1195,7 +1195,7 @@ struct llama_grammar * llama_grammar_init_impl(
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
return nullptr;
}
}
@@ -260,6 +260,7 @@ endif()
set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
unset(LLAMA_TEST_NAME)

# GGUF model data fetcher library for tests that need real model metadata
# Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)

@@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
llama_build_and_test(test-alloc.cpp)
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

llama_build(export-graph-ops.cpp)
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
tests/export-graph-ops.cpp (new file, 169 lines)
@@ -0,0 +1,169 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "../src/llama-ext.h"
#include "ggml.h"

#include <array>
#include <vector>
#include <set>
#include <fstream>
#include <iostream>

struct input_tensor {
ggml_type type;
std::array<int64_t, 4> ne;
std::array<size_t, 4> nb;

input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
}

bool operator<(const input_tensor &b) const {
return std::tie(type, ne, nb) <
std::tie(b.type, b.ne, b.nb);
}

void serialize(std::ostream& out) const {
out << type << ' ';
for (size_t i = 0; i < 4; i++) {
out << ne[i] << ' ';
}
for (size_t i = 0; i < 4; i++) {
out << nb[i] << ' ';
}
}
};

struct test_object {
ggml_op op;
ggml_type type;
std::array<int64_t, 4> ne;
std::vector<int32_t> op_params;
std::vector<input_tensor> sources;
std::string name;

void serialize(std::ostream& out) const {
out << op << ' ' << type << ' ';
for (size_t i = 0; i < 4; i++) {
out << ne[i] << ' ';
}

out << op_params.size() << ' ';
for (size_t i = 0; i < op_params.size(); i++) {
out << op_params[i] << ' ';
}

out << sources.size() << ' ';
for (size_t s = 0; s < sources.size(); s++) {
sources[s].serialize(out);
}

if (!name.empty()) {
out << name;
} else {
out << '-';
}

out << '\n';
}

bool operator<(const test_object &b) const {
return std::tie(op, type, ne, op_params, sources) <
std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
}
};

static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
int n_nodes = ggml_graph_n_nodes(cgraph);
int n_skipped = 0;
int n_before = (int) tests.size();
for (int i = 0; i < n_nodes; i++) {
ggml_tensor * node = ggml_graph_node(cgraph, i);

if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
n_skipped++;
continue;
}

test_object test;

test.op = node->op;
test.type = node->type;
memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));

test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);

for (size_t s = 0; s < GGML_MAX_SRC; s++) {
if (node->src[s] == nullptr) {
break;
}

test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
}

test.name = node->name;
tests.insert(test);
}

int n_new = (int) tests.size() - n_before;
LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
label, n_new, n_nodes, n_skipped);
}

int main(int argc, char ** argv) {
common_params params;
params.out_file = "tests.txt";

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
return 1;
}

common_init();

// Load CPU-only
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
params.devices = { cpu_device, nullptr };
params.fit_params = false;
params.n_gpu_layers = 0;

params.warmup = false;

auto init_result = common_init_from_params(params);

llama_context * ctx = init_result->context();

const uint32_t n_seqs = llama_n_seq_max(ctx);
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));

std::set<test_object> tests;

auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
if (!gf_pp) {
throw std::runtime_error("failed to reserve prompt processing graph");
}
extract_graph_ops(gf_pp, "pp", tests);

auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
if (!gf_tg) {
throw std::runtime_error("failed to reserve token generation graph");
}
extract_graph_ops(gf_tg, "tg", tests);

LOG_INF("%d unique ops total\n", (int) tests.size());

std::ofstream f(params.out_file);

if (!f.is_open()) {
throw std::runtime_error("Unable to open output file");
}

for (const auto& test : tests) {
test.serialize(f);
}

return 0;
}
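For reference, each record emitted by the two serialize methods above (and parsed back by make_test_cases_from_file in the test-backend-ops hunk below) is one whitespace-separated line of the form:

<op> <type> <ne0> <ne1> <ne2> <ne3> <n_params> <p0> ... <n_src> (<src_type> <src_ne0..ne3> <src_nb0..nb3>)* <name|->

where op and type are raw enum values (so a dump is only meaningful against the same ggml build), and '-' stands in for an empty node name.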
@@ -31,10 +31,12 @@
#include <cstring>
#include <ctime>
#include <future>
#include <fstream>
#include <memory>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <thread>
@@ -6648,6 +6650,236 @@ struct test_diag : public test_case {
}
};

// Deserializable generic test case
struct input_tensor {
ggml_type type;
std::array<int64_t, 4> ne;
std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
};

static bool is_non_contiguous(const input_tensor & src) {
if (src.nb[0] == 0) {
return false;
}
const size_t default_nb0 = ggml_type_size(src.type);
const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
const size_t default_nb2 = default_nb1 * src.ne[1];
const size_t default_nb3 = default_nb2 * src.ne[2];
return src.nb[0] != default_nb0 ||
src.nb[1] != default_nb1 ||
src.nb[2] != default_nb2 ||
src.nb[3] != default_nb3;
}

static std::string var_to_str(const std::vector<input_tensor>& sources) {
std::ostringstream oss;
bool first = true;
for (const auto& src : sources) {
if (!first) oss << ",";
oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
if (is_non_contiguous(src)) {
oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
}
first = false;
}
return oss.str();
}

static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
std::ostringstream oss;
oss << "[";
bool first = true;
for (size_t i = 0; i < params.size(); ++i) {
if (params[i] != 0) {
if (!first) oss << ",";
oss << i << ":" << params[i];
first = false;
}
}
oss << "]";
return oss.str();
}

struct test_generic_op : public test_case {
const ggml_op op;
const ggml_type type;
const std::array<int64_t, 4> ne;
const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;

const std::vector<input_tensor> sources;
const std::string name;

std::string vars() override {
if (name.empty()) {
return VARS_TO_STR4(type, ne, op_params, sources);
}

return VARS_TO_STR5(name, type, ne, op_params, sources);
}

test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
std::vector<input_tensor> sources, std::string name = "")
: op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);

std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
for (size_t i = 0; i < source_count; ++i) {
const input_tensor& src = sources[i];

if (is_non_contiguous(src)) {
size_t total_size;
const size_t blck_size = ggml_blck_size(src.type);
if (blck_size == 1) {
total_size = ggml_type_size(src.type);
for (int d = 0; d < 4; d++) {
total_size += (src.ne[d] - 1) * src.nb[d];
}
} else {
total_size = src.ne[0] * src.nb[0] / blck_size;
for (int d = 1; d < 4; d++) {
total_size += (src.ne[d] - 1) * src.nb[d];
}
}

// Convert bytes to elements, padded to block size for quantized types
const size_t type_size = ggml_type_size(src.type);
size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;
ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
source_tensors[i] = ggml_view_4d(ctx, backing,
src.ne[0], src.ne[1], src.ne[2], src.ne[3],
src.nb[1], src.nb[2], src.nb[3], 0);
// nb[0] does not get set by view_4d, so set it manually
source_tensors[i]->nb[0] = src.nb[0];
} else {
source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
}
}

// Ops with an inplace flag create a view of src[0] as their output.
bool inplace = false;
if (op == GGML_OP_SET || op == GGML_OP_ACC) {
inplace = op_params[4] != 0;
} else if (op == GGML_OP_ADD_REL_POS) {
inplace = op_params[0] != 0;
}

ggml_tensor * out;
if (inplace && source_count > 0) {
out = ggml_view_tensor(ctx, source_tensors[0]);
} else {
out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
}
out->op = op;
for (size_t i = 0; i < source_count; ++i) {
out->src[i] = source_tensors[i];
}

memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
ggml_set_name(out, "out");

return out;
}

double max_nmse_err() override {
switch (op) {
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
case GGML_OP_OUT_PROD:
case GGML_OP_CONV_TRANSPOSE_2D:
case GGML_OP_IM2COL:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
case GGML_OP_SET_ROWS:
case GGML_OP_CPY:
return 5e-4;
case GGML_OP_SOFT_MAX:
return 1e-6;
case GGML_OP_RWKV_WKV7:
return 5e-3;
case GGML_OP_FLASH_ATTN_EXT:
{
// Scale error with kv length to account for accumulating floating point error
const int64_t kv = sources[1].ne[1];
return 5e-4 * std::max(1.0, kv / 20000.0);
}
default:
return 1e-7;
}
}

void initialize_tensors(ggml_context * ctx) override {
ggml_tensor * out = ggml_get_tensor(ctx, "out");

std::random_device rd;
std::default_random_engine rng(rd());

for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
ggml_tensor * t = out->src[i];
if (!t) {
break;
}

// FLASH_ATTN_EXT: src[3] is the KQ mask
if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
init_tensor_kq_mask(t);
continue;
}

if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
const int64_t num_rows = sources[0].ne[1];
const int64_t nels = ggml_nelements(t);
std::vector<int32_t> data(nels);
std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
for (int64_t i = 0; i < nels; i++) {
data[i] = dist(rng);
}
ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
} else if (op == GGML_OP_SET_ROWS) {
init_set_rows_row_ids(t, ne[1]);
} else if (op == GGML_OP_ROPE) {
const int mode = op_params[2];
const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
std::vector<int32_t> data(nels);
std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
for (int64_t i = 0; i < nels; i++) {
data[i] = dist(rng);
}
ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
} else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
for (int64_t r = 0; r < ggml_nrows(t); r++) {
std::vector<int32_t> data(t->ne[0]);
for (int32_t i = 0; i < t->ne[0]; i++) {
data[i] = i % n_expert;
}
std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
}
} else if (op == GGML_OP_SSM_SCAN) {
for (int64_t r = 0; r < ggml_nrows(t); r++) {
std::vector<int32_t> data(t->ne[0]);
for (int32_t i = 0; i < t->ne[0]; i++) {
data[i] = i;
}
std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
}
} else {
init_tensor_uniform(t);
}
} else {
init_tensor_uniform(t);
}
}
}
};

enum llm_norm_type {
LLM_NORM,
@@ -8751,8 +8983,72 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
|
||||
std::ifstream f(path);
|
||||
|
||||
if (!f.is_open()) {
|
||||
throw std::runtime_error("Unable to read test file");
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
std::string line;
|
||||
|
||||
while (std::getline(f, line)) {
|
||||
std::istringstream iss(line);
|
||||
|
||||
ggml_op op;
|
||||
ggml_type type;
|
||||
std::array<int64_t, 4> ne;
|
||||
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
|
||||
std::string name;
|
||||
uint64_t tmp;
|
||||
|
||||
iss >> tmp;
|
||||
op = (ggml_op)tmp;
|
||||
iss >> tmp;
|
||||
type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> ne[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
for (size_t i = 0; i < tmp && i < op_params.size(); i++) {
|
||||
iss >> op_params[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
|
||||
size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp);
|
||||
std::vector<input_tensor> sources(num_src);
|
||||
for (size_t i = 0; i < num_src; i++) {
|
||||
input_tensor& src = sources[i];
|
||||
iss >> tmp;
|
||||
src.type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.ne[i];
|
||||
}
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.nb[i];
|
||||
}
|
||||
}
|
||||
|
||||
iss >> name;
|
||||
|
||||
if (name.length() == 1 && name[0] == '-') {
|
||||
name = "";
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
|
||||
}
|
||||
|
||||
return test_cases;
|
||||
}
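Taken together, the reads above imply a simple whitespace-separated line format:

    <op> <type> <ne0> <ne1> <ne2> <ne3> <n_params> <params...> <n_src> [<src_type> <ne0..ne3> <nb0..nb3>]*n_src <name|->

A hypothetical F32 line (op id, dims, and strides purely illustrative, not taken from a real llama-export-graph-ops dump; enum ids depend on the ggml version):

    23 0 32 64 1 1 1 0 2 0 128 32 1 1 4 512 16384 16384 0 128 64 1 1 4 512 32768 32768 -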

static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
                         printer * output_printer) {
                         printer * output_printer, const char * test_file_path) {
    auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
        if (params_filter == nullptr) {
            return;
@@ -8770,9 +9066,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        }
    };

    std::vector<std::unique_ptr<test_case>> test_cases;

    if (test_file_path == nullptr) {
        switch (mode) {
            case MODE_TEST:
            case MODE_GRAD:
            case MODE_SUPPORT:
                test_cases = make_test_cases_eval();
                break;
            case MODE_PERF:
                test_cases = make_test_cases_perf();
                break;
        }
    } else {
        test_cases = make_test_cases_from_file(test_file_path);
    }

    filter_test_cases(test_cases, params_filter);

    if (mode == MODE_TEST) {
        auto test_cases = make_test_cases_eval();
        filter_test_cases(test_cases, params_filter);
        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
        if (backend_cpu == NULL) {
            test_operation_info info("", "", "CPU");
@@ -8812,8 +9125,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_GRAD) {
        auto test_cases = make_test_cases_eval();
        filter_test_cases(test_cases, params_filter);
        size_t n_ok = 0;
        for (auto & test : test_cases) {
            if (test->eval_grad(backend, op_names_filter, output_printer)) {
@@ -8826,8 +9137,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_PERF) {
        auto test_cases = make_test_cases_perf();
        filter_test_cases(test_cases, params_filter);
        for (auto & test : test_cases) {
            test->eval_perf(backend, op_names_filter, output_printer);
        }
@@ -8835,9 +9144,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    if (mode == MODE_SUPPORT) {
        auto test_cases = make_test_cases_eval();
        filter_test_cases(test_cases, params_filter);

        // Filter out fusion cases
        test_cases.erase(
            std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
@@ -8956,7 +9262,8 @@ static void show_test_coverage() {
}

static void usage(char ** argv) {
    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
    printf(" [--show-coverage] [--test-file <path>]\n");
    printf(" valid modes:\n");
    printf(" - test (default, compare with CPU backend for correctness)\n");
    printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -8967,6 +9274,7 @@ static void usage(char ** argv) {
    printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
    printf(" --list-ops lists all available GGML operations\n");
    printf(" --show-coverage shows test coverage\n");
    printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
}
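With the flag wired into main() below, a replay run could look like this (backend and file names illustrative):

    ./test-backend-ops test -b Vulkan0 --test-file graph-ops.txt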

int main(int argc, char ** argv) {
@@ -8975,6 +9283,7 @@ int main(int argc, char ** argv) {
    const char * op_names_filter = nullptr;
    const char * backend_filter = nullptr;
    const char * params_filter = nullptr;
    const char * test_file_path = nullptr;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "test") == 0) {
@@ -9022,6 +9331,13 @@ int main(int argc, char ** argv) {
        } else if (strcmp(argv[i], "--show-coverage") == 0) {
            show_test_coverage();
            return 0;
        } else if (strcmp(argv[i], "--test-file") == 0) {
            if (i + 1 < argc) {
                test_file_path = argv[++i];
            } else {
                usage(argv);
                return 1;
            }
        } else {
            usage(argv);
            return 1;
@@ -9074,7 +9390,7 @@ int main(int argc, char ** argv) {
            false, "", ggml_backend_dev_description(dev),
            total / 1024 / 1024, free / 1024 / 1024, true));

        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);

        if (ok) {
            n_ok++;

@@ -15,8 +15,12 @@

using json = nlohmann::ordered_json;

static llama_grammar * build_grammar_with_root(const std::string & grammar_str, const char * grammar_root) {
    return llama_grammar_init_impl(nullptr, grammar_str.c_str(), grammar_root, false, nullptr, 0, nullptr, 0);
}

static llama_grammar * build_grammar(const std::string & grammar_str) {
    return llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
    return build_grammar_with_root(grammar_str, "root");
}

static bool test_build_grammar_fails(const std::string & grammar_str) {
@@ -860,6 +864,36 @@ static void test_failure_left_recursion() {
    fprintf(stderr, " ✅︎ Passed\n");
}

static void test_failure_missing_root_symbol() {
    fprintf(stderr, "⚫ Testing missing root symbol:\n");

    const std::string grammar_str = R"""(
        root ::= "foobar"
    )""";

    llama_grammar * failure_result = build_grammar_with_root(grammar_str, "nonexistent");
    assert(failure_result == nullptr);

    fprintf(stderr, " ✅︎ Passed\n");
}

static void test_custom_root_symbol_check() {
    fprintf(stderr, "⚫ Testing custom root symbol check:\n");

    const std::string custom_root_grammar_str = R"""(
        foobar ::= "foobar"
    )""";

    llama_grammar * failure_result = build_grammar_with_root(custom_root_grammar_str, "root");
    assert(failure_result == nullptr);

    llama_grammar * success_result = build_grammar_with_root(custom_root_grammar_str, "foobar");
    assert(success_result != nullptr);
    llama_grammar_free_impl(success_result);

    fprintf(stderr, " ✅︎ Passed\n");
}
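The contract these two tests pin down: llama_grammar_init_impl returns nullptr when the requested root symbol is not defined in the grammar, so a nullptr check is the entire error protocol. A minimal caller-side sketch (mirrors the test helpers above; not part of the patch):

    llama_grammar * g = build_grammar_with_root(R"""(start ::= "ok")""", "start");
    if (g == nullptr) {
        // parse failed or the root symbol "start" is missing from the grammar
        fprintf(stderr, "grammar rejected\n");
    } else {
        llama_grammar_free_impl(g);
    }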

static void test_json_schema() {
    // Note that this is similar to the regular grammar tests,
    // but we convert each json schema to a grammar before parsing.
@@ -1433,6 +1467,8 @@ int main() {
    test_failure_missing_root();
    test_failure_missing_reference();
    test_failure_left_recursion();
    test_failure_missing_root_symbol();
    test_custom_root_symbol_check();
    test_json_schema();
    fprintf(stdout, "All tests passed.\n");
    return 0;

@@ -11,6 +11,7 @@ sys.path.insert(0, str(path))

import datetime
from utils import *
from typing import Literal

server: ServerProcess

@@ -23,24 +24,24 @@ def create_server():


@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "<think>\n</think>"),
@pytest.mark.parametrize("template_name,reasoning,expected_end", [
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", "on", "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", "auto", "<think>\n"),
    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", "off", "<think>\n</think>"),

    ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"),
    ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n<think>\n\n</think>\n\n"),
    ("Qwen-Qwen3-0.6B", "auto", "<|im_start|>assistant\n"),
    ("Qwen-Qwen3-0.6B", "off", "<|im_start|>assistant\n<think>\n\n</think>\n\n"),

    ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n<think>\n"),
    ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n<think>\n</think>"),
    ("Qwen-QwQ-32B", "auto", "<|im_start|>assistant\n<think>\n"),
    ("Qwen-QwQ-32B", "off", "<|im_start|>assistant\n<think>\n</think>"),

    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", "auto", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", "off", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
])
def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]):
def test_reasoning(template_name: str, reasoning: Literal['on', 'off', 'auto'] | None, expected_end: str, tools: list[dict]):
    global server
    server.jinja = True
    server.reasoning_budget = reasoning_budget
    server.reasoning = reasoning
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start()


@@ -95,7 +95,7 @@ class ServerProcess:
    no_webui: bool | None = None
    jinja: bool | None = None
    reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
    reasoning_budget: int | None = None
    reasoning: Literal['on', 'off', 'auto'] | None = None
    chat_template: str | None = None
    chat_template_file: str | None = None
    server_path: str | None = None
@@ -225,8 +225,8 @@ class ServerProcess:
            server_args.append("--no-jinja")
        if self.reasoning_format is not None:
            server_args.extend(("--reasoning-format", self.reasoning_format))
        if self.reasoning_budget is not None:
            server_args.extend(("--reasoning-budget", self.reasoning_budget))
        if self.reasoning is not None:
            server_args.extend(("--reasoning", self.reasoning))
        if self.chat_template:
            server_args.extend(["--chat-template", self.chat_template])
        if self.chat_template_file:
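Note: the integer --reasoning-budget option gives way to the string-valued --reasoning option here; judging by the parametrize table above, -1 (unlimited budget) maps to "auto" and 0 (no budget) to "off", with "on" as a new explicit setting.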

vendor/cpp-httplib/httplib.cpp (vendored)

@@ -4424,7 +4424,8 @@ get_range_offset_and_length(Range r, size_t content_length) {
  assert(r.first <= r.second &&
         r.second < static_cast<ssize_t>(content_length));
  (void)(content_length);
  return std::make_pair(r.first, static_cast<size_t>(r.second - r.first) + 1);
  return std::make_pair(static_cast<size_t>(r.first),
                        static_cast<size_t>(r.second - r.first) + 1);
}

std::string make_content_range_header_field(
@@ -8616,11 +8617,17 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
  handle.body_reader_.stream = handle.stream_;
  handle.body_reader_.payload_max_length = payload_max_length_;

  auto content_length_str = handle.response->get_header_value("Content-Length");
  if (!content_length_str.empty()) {
  if (handle.response->has_header("Content-Length")) {
    bool is_invalid = false;
    auto content_length = detail::get_header_value_u64(
        handle.response->headers, "Content-Length", 0, 0, is_invalid);
    if (is_invalid) {
      handle.error = Error::Read;
      handle.response.reset();
      return handle;
    }
    handle.body_reader_.has_content_length = true;
    handle.body_reader_.content_length =
        static_cast<size_t>(std::stoull(content_length_str));
    handle.body_reader_.content_length = content_length;
  }

  auto transfer_encoding =
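The motivation for the hunk above: std::stoull throws std::invalid_argument or std::out_of_range, so a malformed Content-Length could previously surface as an exception from open_stream; detail::get_header_value_u64 instead reports failure through its is_invalid out-flag. A standalone sketch of the same non-throwing pattern (hypothetical helper, not from the library):

    #include <cerrno>
    #include <cstdint>
    #include <cstdlib>
    #include <string>

    // Parse a Content-Length value without throwing: reject anything that is
    // not a plain run of digits, then let strtoull flag overflow via errno.
    static bool parse_content_length(const std::string & v, uint64_t & out) {
        if (v.empty() || v.find_first_not_of("0123456789") != std::string::npos) {
            return false;
        }
        errno = 0;
        out = std::strtoull(v.c_str(), nullptr, 10);
        return errno == 0;
    }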

vendor/cpp-httplib/httplib.h (vendored)

@@ -8,28 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H

#define CPPHTTPLIB_VERSION "0.37.0"
#define CPPHTTPLIB_VERSION_NUM "0x002500"

/*
 * Platform compatibility check
 */

#if defined(_WIN32) && !defined(_WIN64)
#if defined(_MSC_VER)
#pragma message( \
    "cpp-httplib doesn't support 32-bit Windows. Please use a 64-bit compiler.")
#else
#warning \
    "cpp-httplib doesn't support 32-bit Windows. Please use a 64-bit compiler."
#endif
#elif defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ < 8
#warning \
    "cpp-httplib doesn't support 32-bit platforms. Please use a 64-bit compiler."
#elif defined(__SIZEOF_SIZE_T__) && __SIZEOF_SIZE_T__ < 8
#warning \
    "cpp-httplib doesn't support platforms where size_t is less than 64 bits."
#endif
#define CPPHTTPLIB_VERSION "0.37.1"
#define CPPHTTPLIB_VERSION_NUM "0x002501"

#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@@ -2797,7 +2777,7 @@ inline size_t get_header_value_u64(const Headers &headers,
  std::advance(it, static_cast<ssize_t>(id));
  if (it != rng.second) {
    if (is_numeric(it->second)) {
      return std::strtoull(it->second.data(), nullptr, 10);
      return static_cast<size_t>(std::strtoull(it->second.data(), nullptr, 10));
    } else {
      is_invalid_value = true;
    }