Compare commits


17 Commits
b8400 ... b8417

Author SHA1 Message Date
Chenguang Li
07ba6d275b CANN: support flash attention for head dim not multiple of 16, fix ALiBi slope offset (#20031)
- Allow FLASH_ATTN_EXT when head dimension D is not a multiple of 16 by
  padding Q/K/V to D_padded = GGML_PAD(D, 16), running FusedInferAttentionScoreV2,
  then slicing the output back to D (ggml-cann.cpp + aclnn_ops.cpp).
- Fix aclnn_get_slope second-part offset: use ggml_type_size(dtype) instead of
  sizeof(float) so ALiBi slopes are correct when dtype is F16 (e.g. GQA with
  48 heads); fixes buffer overflow and large numerical errors in those cases.
2026-03-19 11:02:42 +08:00
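For illustration, a minimal standalone C++ sketch of the two ideas in this commit, assuming nothing beyond the standard library (pad_to_multiple stands in for GGML_PAD, and the 2-byte element size stands in for ggml_type_size(GGML_TYPE_F16)); it is not the CANN code itself:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Round x up to the next multiple of n, as GGML_PAD(x, n) does in ggml.h.
static int64_t pad_to_multiple(int64_t x, int64_t n) {
    return (x + n - 1) / n * n;
}

int main() {
    // Head dimension that is not a multiple of 16: Q/K/V are zero-padded to
    // D_padded before FusedInferAttentionScoreV2, and the output is sliced
    // back to D afterwards.
    const int64_t D        = 72;
    const int64_t D_padded = pad_to_multiple(D, 16);
    std::printf("D = %lld -> D_padded = %lld\n", (long long) D, (long long) D_padded);

    // ALiBi slope offset for the second group of heads: the byte offset must
    // use the element size of the actual dtype (2 bytes for F16), not
    // sizeof(float), otherwise the second part is written past the buffer.
    const int64_t n_head_log2 = 32;  // largest power of two <= 48 heads
    const size_t  f16_size    = 2;   // stand-in for ggml_type_size(GGML_TYPE_F16)
    std::printf("offset: %zu bytes (wrong: %zu with sizeof(float))\n",
                (size_t) (n_head_log2 * f16_size),
                (size_t) (n_head_log2 * sizeof(float)));
    return 0;
}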
Michael Grau
6729d4920c model : add control vector support where missing (#20653)
* Add control vector functions to qwen3.5 and qwen-next models

* Add missing cvec compatibility to the rest of the models

* Adjust comments and formatting

* cleanup

* whitespace

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-03-18 23:25:12 +01:00
Sigbjørn Skjæret
d13d60af1d gguf-py : cleaner way to get the first key (#20727) 2026-03-18 23:21:42 +01:00
crsawyer
5744d7ec43 Rebuild index.html.gz (#20724) 2026-03-18 18:49:57 +01:00
Reese Levine
8ced5f41f9 Move to no timeout for WaitAny in graph submission to avoid deadlocks in some cases on llvmpipe backends (#20618) 2026-03-18 10:23:47 -07:00
Shaw Nguyen
78d550b541 ggml-cpu/x86: fix unused changemask warning in repack (#20692) 2026-03-18 18:45:06 +02:00
Georgi Gerganov
4efd326e71 sync : ggml 2026-03-18 15:17:28 +02:00
Georgi Gerganov
b08f7322ee ggml : bump version to 0.9.8 (ggml/1442) 2026-03-18 15:17:28 +02:00
Georgi Gerganov
79187f2fb8 ggml : restore ggml_type_sizef() to avoid major version bump (ggml/1441) 2026-03-18 15:17:28 +02:00
Julien Chaumond
48e61238e1 webui: improve tooltip wording for attachment requirements (#20688)
* webui: improve tooltip wording for attachment requirements

Co-Authored-By: Claude <Agents+claude@huggingface.co>

* chore: update webui build output

* chore: update webui build output

---------

Co-authored-by: Claude <Agents+claude@huggingface.co>
2026-03-18 14:01:02 +01:00
Pop Flamingo
312cf03328 llama : re-enable manual LoRA adapter free (#19983)
* Re-enable manual LoRA adapter free

* Remove stale "all adapters must be loaded before context creation" comments
2026-03-18 12:03:26 +02:00
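A hedged usage sketch of the re-enabled API (file paths are placeholders, error handling is trimmed, and llama_set_adapter_lora is only referenced in a comment):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    // Adapters no longer need to outlive every context: they can be loaded
    // and released on demand.
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    if (adapter != nullptr) {
        // ... attach with llama_set_adapter_lora(ctx, adapter, scale) and run ...
        llama_adapter_lora_free(adapter);  // manual free works again
    }

    // Adapters that are not freed manually are still released with the model.
    llama_model_free(model);
    llama_backend_free();
    return 0;
}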
Masato Nakasaka
f4049ad735 tests : fix test-jinja-py Windows failures by bypassing command-line args [no ci] (#20483)
* Fix errors occurring on Windows

* Reverted fix

#20365 will take care of CRLF issue

* Changed to write directly to stdin

* Prevent fclose to happen twice
2026-03-18 10:43:31 +01:00
Aldehir Rojas
5e8910a0db common : rework gpt-oss parser (#20393)
* common : rework gpt-oss parser

* cont : fix gpt-oss tests

* cont : add structured output test

* cont : rename final to final_msg
2026-03-18 10:41:25 +01:00
Aaron Teo
fe00a84b4b tests: enable kv_unified to prevent cuda oom error on rtx 2060 (#20645)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2026-03-18 17:40:22 +08:00
Aleksander Grygier
7ab321d40d webui: Fix duplicated messages on q param (#20715)
* fix: Remove duplicate message sending on `?q` param

* chore: update webui build output
2026-03-18 10:32:43 +01:00
uvos
7533a7d509 HIP : ignore return of hipMemAdvise [no ci] (#20696) 2026-03-18 09:53:13 +01:00
Andreas Obersteiner
a69d54f990 context : fix graph not resetting when control vector changes (#20381) 2026-03-18 08:10:13 +02:00
37 changed files with 254 additions and 193 deletions

View File

@@ -933,17 +933,12 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
// Copy reasoning to the "thinking" field as expected by the gpt-oss template
auto adjusted_messages = json::array();
for (const auto & msg : inputs.messages) {
auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
if (has_reasoning_content && has_tool_calls) {
auto adjusted_message = msg;
adjusted_message["thinking"] = msg.at("reasoning_content");
adjusted_messages.push_back(adjusted_message);
} else {
adjusted_messages.push_back(msg);
for (auto msg : inputs.messages) {
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
msg["thinking"] = msg.at("reasoning_content");
msg.erase("content");
}
adjusted_messages.push_back(msg);
}
auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
@@ -969,45 +964,31 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
"<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && has_tools;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
const std::string END = "<|end|>";
const std::string START = "<|start|>";
const std::string MESSAGE = "<|message|>";
const std::string CHANNEL = "<|channel|>";
const std::string CONSTRAIN = "<|constrain|>";
const std::string START_ASSISTANT = START + "assistant";
const std::string CHANNEL_ANALYSIS = CHANNEL + "analysis";
const std::string CHANNEL_COMMENTARY = CHANNEL + "commentary";
const std::string CHANNEL_FINAL = CHANNEL + "final";
auto start = p.rule("start", p.literal("<|start|>assistant"));
auto end = p.rule("end", p.literal("<|end|>"));
auto content = p.rule("message-content", p.until("<|end|>"));
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
auto the_end = END | p.end();
auto analysis = p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
auto any = p.rule("any", preamble | analysis);
const std::string analysis_header = CHANNEL_ANALYSIS + MESSAGE;
auto segment_content = p.until(END);
auto analysis_segment = extract_reasoning ?
p.literal(analysis_header) + p.reasoning(segment_content) + p.until(END) + the_end :
p.content(analysis_header + p.until(END) + the_end);
if (has_response_format) {
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
auto response_format = p.rule("response-format",
p.literal("<|channel|>final") + constraint + p.literal("<|message|>") +
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
auto channel_header_content = p.until_one_of({ " to=functions.", MESSAGE });
auto content_header = p.choice({ p.literal(CHANNEL_COMMENTARY), p.literal(CHANNEL_FINAL) });
auto content_segment = p.rule("content-segment", content_header + channel_header_content + MESSAGE +
p.content(segment_content) + the_end);
if (!inputs.json_schema.is_null()) {
auto final_header = p.literal(CHANNEL_FINAL);
auto constraint = p.optional(p.space() + p.literal(CONSTRAIN) + channel_header_content);
return p.optional(analysis_segment) + final_header + constraint + MESSAGE +
p.content(p.schema(p.json(), "response-format", inputs.json_schema));
return response_format | (analysis + p.zero_or_more(start + analysis) + start + response_format);
}
auto segment = p.optional(START_ASSISTANT + p.space()) + p.choice({ content_segment, analysis_segment });
auto contents = p.optional(segment + p.repeat(p.optional(p.space()) + segment, 0, -1)) + p.end();
// Tool call parser
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
auto tool_choice = p.choice();
@@ -1016,42 +997,37 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
std::string name = function.at("name");
const auto & params = function.at("parameters");
// Tool call can appear as:
// 1. In role header: " to=functions.NAME<|channel|>..."
// 2. In channel: "<|channel|>(analysis|commentary) to=functions.NAME..."
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
auto channel = p.literal(CHANNEL_COMMENTARY) | p.literal(CHANNEL_ANALYSIS);
auto constraint = p.space() + p.optional(p.literal(CONSTRAIN) + channel_header_content);
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
auto args = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", params));
// Pattern 1: recipient in role header
// " to=functions.NAME<|channel|>(analysis|commentary)[constraint]<|message|>ARGS"
auto tool_in_role = p.tool(p.tool_open(func_name + channel) + constraint + MESSAGE + args);
// recipient in role header
// <|start|>assistant to=functions.NAME<|channel|>(commentary|analysis)[constraint]<|message|>ARGS
auto tool_in_role = p.tool(p.tool_open(func_name + channel + constraint + p.literal("<|message|>")) + args);
// Pattern 2: recipient in channel header
// "<|channel|>(analysis|commentary) to=functions.NAME[constraint]<|message|>ARGS"
auto tool_in_channel = p.tool(channel + p.tool_open(func_name + constraint + MESSAGE) + args);
// recipient in channel header
// <|channel|>(commentary|analysis) to=functions.NAME[constraint]<|message|>ARGS
auto tool_in_channel = p.tool(p.tool_open(channel + func_name + constraint + p.literal("<|message|>")) + args);
tool_choice |= tool_in_role | tool_in_channel;
tool_choice |= p.rule("tool-" + name, tool_in_role | tool_in_channel);
});
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_call = p.trigger_rule("tool-call", tool_choice);
auto role_start = p.optional(p.space() + p.literal(START_ASSISTANT));
auto tool_call = p.rule("tool-call", p.repeat(role_start + tool_choice, min_calls, max_calls) + p.end());
if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
return tool_call | ( any + p.zero_or_more(start + any) + start + tool_call);
}
return p.choice({ p.trigger_rule("single-tool", tool_call), p.trigger_rule("tools", p.one_or_more(segment) + tool_call) });
return tool_call | final_msg | (any + p.zero_or_more(start + any) + start + (tool_call | final_msg));
}
return contents;
return final_msg | (any + p.zero_or_more(start + any) + start + final_msg);
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
@@ -1062,10 +1038,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "(?:<\\|end\\|>)(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
"(?:<\\|start\\|>assistant\\s*)?(<\\|channel\\|>(?:commentary|analysis)\\s+to=functions)" }
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^\\s+to$" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(\\s+to)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(<\\|channel\\|>(?:commentary|analysis)\\s+to)" }
};
}

View File

@@ -1067,7 +1067,7 @@ common_init_result::common_init_result(common_params & params) :
const llama_vocab * vocab = llama_model_get_vocab(model);
// load and optionally apply lora adapters (must be loaded before context creation)
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));

View File

@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 7)
set(GGML_VERSION_PATCH 8)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

View File

@@ -733,6 +733,10 @@ extern "C" {
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
"use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);

View File

@@ -1544,8 +1544,8 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx,
end = 2 * ((n_head - 1) - n_head_log2) + 1;
step = 2;
count = n_head - n_head_log2;
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
dtype);
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * ggml_type_size(dtype), m1, count, start, end + 1,
step, dtype);
}
}
@@ -3599,6 +3599,44 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
// Step 2.5: Pad Q, K, V along head dimension if D is not a multiple of 16
// (required by FusedInferAttentionScoreV2)
const int64_t D = src0->ne[0];
const int64_t D_padded = GGML_PAD(D, 16);
const bool needs_padding = (D != D_padded);
ggml_cann_pool_alloc q_pad_allocator(ctx.pool());
ggml_cann_pool_alloc k_pad_allocator(ctx.pool());
ggml_cann_pool_alloc v_pad_allocator(ctx.pool());
if (needs_padding) {
int64_t paddings[] = { 0, D_padded - D, 0, 0, 0, 0, 0, 0 };
auto pad_fa_tensor = [&](acl_tensor_ptr & tensor, const int64_t * bsnd_ne,
ggml_cann_pool_alloc & allocator) {
int64_t pad_ne[GGML_MAX_DIMS] = { D_padded, bsnd_ne[1], bsnd_ne[2], bsnd_ne[3] };
size_t pad_nb[GGML_MAX_DIMS];
pad_nb[0] = faElemSize;
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
pad_nb[i] = pad_nb[i - 1] * pad_ne[i - 1];
}
int64_t nelements = pad_ne[0] * pad_ne[1] * pad_ne[2] * pad_ne[3];
void * buffer = allocator.alloc(nelements * faElemSize);
acl_tensor_ptr padded =
ggml_cann_create_tensor(buffer, faDataType, faElemSize, pad_ne, pad_nb, GGML_MAX_DIMS);
aclnn_pad(ctx, tensor.get(), padded.get(), paddings);
tensor = std::move(padded);
};
pad_fa_tensor(acl_q_tensor, src0_bsnd_ne, q_pad_allocator);
pad_fa_tensor(acl_k_tensor, src1_bsnd_ne, k_pad_allocator);
pad_fa_tensor(acl_v_tensor, src2_bsnd_ne, v_pad_allocator);
src0_bsnd_ne[0] = D_padded;
src1_bsnd_ne[0] = D_padded;
src2_bsnd_ne[0] = D_padded;
}
// Step 3: create the PSEShift tensor if needed
// this tensor is considered as mask (f16) in the llama.cpp
acl_tensor_ptr bcast_pse_tensor;
@@ -3688,17 +3726,16 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
acl_tensor_ptr fa_dst_tensor;
acl_tensor_ptr acl_dst_tensor;
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
if (dst->type == GGML_TYPE_F32) {
void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
if (dst->type == GGML_TYPE_F32 || needs_padding) {
int64_t * out_f16_ne = src0_bsnd_ne;
size_t out_f16_nb[GGML_MAX_DIMS];
out_f16_nb[0] = faElemSize;
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
}
int64_t out_nelements = out_f16_ne[0] * out_f16_ne[1] * out_f16_ne[2] * out_f16_ne[3];
void * out_f16_buffer = out_f16_allocator.alloc(out_nelements * faElemSize);
fa_dst_tensor =
ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
@@ -3730,8 +3767,33 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
nullptr // softmaxLse
);
if (dst->type == GGML_TYPE_F32) {
// Step 6: post-processing, permute and cast to f32
// Step 6: post-processing — slice padded output and/or cast to f32
if (needs_padding) {
ggml_cann_pool_alloc sliced_f16_allocator(ctx.pool());
if (dst->type == GGML_TYPE_F32) {
int64_t sliced_ne[GGML_MAX_DIMS] = { D, src0_bsnd_ne[1], src0_bsnd_ne[2], src0_bsnd_ne[3] };
size_t sliced_nb[GGML_MAX_DIMS];
sliced_nb[0] = faElemSize;
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
sliced_nb[i] = sliced_nb[i - 1] * sliced_ne[i - 1];
}
int64_t sliced_nelements = sliced_ne[0] * sliced_ne[1] * sliced_ne[2] * sliced_ne[3];
void * sliced_buffer = sliced_f16_allocator.alloc(sliced_nelements * faElemSize);
acl_tensor_ptr sliced_f16_tensor = ggml_cann_create_tensor(sliced_buffer, faDataType, faElemSize,
sliced_ne, sliced_nb, GGML_MAX_DIMS);
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, sliced_f16_tensor.get());
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, sliced_f16_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
} else {
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, acl_dst_tensor.get());
}
} else if (dst->type == GGML_TYPE_F32) {
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
}

View File

@@ -2503,10 +2503,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
// different head sizes of K and V are not supported yet
return false;
}
if (op->src[0]->ne[0] % 16 != 0) {
// TODO: padding to support
return false;
}
float logitSoftcap = 0.0f;
memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
if (logitSoftcap != 0.0f) {

View File

@@ -531,7 +531,6 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
UNUSED(bs);
__m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
__m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// Permute mask used for easier vector processing at later stages
@@ -580,6 +579,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
if constexpr (
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
const __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
// Load 8 E8M0 exponents and convert to float via LUT

View File

@@ -126,7 +126,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
if (err == hipSuccess) {
// hipMemAdviseSetCoarseGrain is an optional performance hint;
// ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs).
cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
(void)cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
(void)hipGetLastError(); // clear any error
}

View File

@@ -509,50 +509,39 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
static void ggml_backend_webgpu_wait(webgpu_global_context & ctx,
std::vector<webgpu_submission> & subs,
bool block = true) {
// If we have too many in-flight submissions, wait on the oldest one first.
if (subs.empty()) {
return;
}
while (subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD) {
auto waitStatus = ctx->instance.WaitAny(1, &subs[0].submit_done, UINT64_MAX);
if (ggml_backend_webgpu_handle_wait_status(waitStatus)) {
bool blocking_wait = block || subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD;
while (blocking_wait) {
auto waitStatus = ctx->instance.WaitAny(1, &subs[0].submit_done, 0);
if (ggml_backend_webgpu_handle_wait_status(waitStatus, true)) {
#ifdef GGML_WEBGPU_GPU_PROFILE
ggml_backend_webgpu_wait_profile_futures(ctx, subs[0].profile_futures, true);
#endif
subs.erase(subs.begin());
}
blocking_wait = (block && !subs.empty()) || subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD;
}
if (subs.empty()) {
return;
}
if (block) {
for (auto & sub : subs) {
while (!sub.submit_done.completed) {
auto waitStatus = ctx->instance.WaitAny(1, &sub.submit_done, UINT64_MAX);
ggml_backend_webgpu_handle_wait_status(waitStatus);
}
// Poll each submit future once and remove completed submissions.
for (auto sub = subs.begin(); sub != subs.end();) {
auto waitStatus = ctx->instance.WaitAny(1, &sub->submit_done, 0);
bool success = ggml_backend_webgpu_handle_wait_status(waitStatus, true);
#ifdef GGML_WEBGPU_GPU_PROFILE
ggml_backend_webgpu_wait_profile_futures(ctx, sub.profile_futures, true);
#endif
}
subs.clear();
} else {
// Poll each submit future once and remove completed submissions.
for (auto sub = subs.begin(); sub != subs.end();) {
auto waitStatus = ctx->instance.WaitAny(1, &sub->submit_done, 0);
ggml_backend_webgpu_handle_wait_status(waitStatus, true);
#ifdef GGML_WEBGPU_GPU_PROFILE
ggml_backend_webgpu_wait_profile_futures(ctx, sub->profile_futures, false);
if (sub->submit_done.completed && sub->profile_futures.empty()) {
ggml_backend_webgpu_wait_profile_futures(ctx, sub->profile_futures, false);
if (success && sub->profile_futures.empty()) {
#else
if (sub->submit_done.completed) {
if (success) {
#endif
sub = subs.erase(sub);
} else {
++sub;
}
sub = subs.erase(sub);
} else {
++sub;
}
}
}
@@ -2961,17 +2950,16 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm
static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = {
/* .iface = */ {
/* .get_name = */ ggml_backend_webgpu_buffer_type_get_name,
/* .alloc_buffer = */
ggml_backend_webgpu_buffer_type_alloc_buffer, /* .get_alignment = */
ggml_backend_webgpu_buffer_type_get_alignment, /* .get_max_size = */
ggml_backend_webgpu_buffer_type_get_max_size, /* .get_alloc_size = */
ggml_backend_webgpu_buffer_type_get_alloc_size, /* .is_host = */ NULL, // defaults to false
/* .get_name = */ ggml_backend_webgpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_webgpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_webgpu_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_webgpu_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_webgpu_buffer_type_get_alloc_size,
/* .is_host = */ NULL, // defaults to false
},
/* .device = */
dev,
/* .context = */
NULL
dev,
/* .context = */ NULL
};
return &ggml_backend_webgpu_buffer_type;

View File

@@ -1294,6 +1294,12 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
return ggml_type_size(type)*ne/ggml_blck_size(type);
}
double ggml_type_sizef(enum ggml_type type) {
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}
const char * ggml_type_name(enum ggml_type type) {
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);

View File

@@ -425,8 +425,7 @@ class GGUFWriter:
fout = self.fout[file_id]
# pop the first tensor info
# TODO: cleaner way to get the first key
first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
first_tensor_name = next(iter(self.tensors[file_id]))
ti = self.tensors[file_id].pop(first_tensor_name)
assert ti.nbytes == tensor.nbytes

View File

@@ -21,9 +21,7 @@ struct llama_sampler_deleter {
};
struct llama_adapter_lora_deleter {
void operator()(llama_adapter_lora *) {
// llama_adapter_lora_free is deprecated
}
void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;

View File

@@ -636,7 +636,6 @@ extern "C" {
// Load a LoRA adapter from file
// The adapter is valid as long as the associated model is not freed
// All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
@@ -660,9 +659,8 @@ extern "C" {
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
// Manually free a LoRA adapter
// NOTE: loaded adapters will be free when the associated model is deleted
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
"adapters are now freed together with the associated model");
// NOTE: loaded adapters that are not manually freed will be freed when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);

View File

@@ -1 +1 @@
553552e1d88be2b214b85e5159eedd39a63e2c34
c044a8eeae2591faa0950c8b5e514cbc4bbfc4ca

View File

@@ -418,7 +418,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
llama_adapter_lora * adapter = new llama_adapter_lora();
llama_adapter_lora * adapter = new llama_adapter_lora(model);
try {
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -471,8 +471,17 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
void llama_adapter_lora_free(llama_adapter_lora *) {
// deprecated: adapters are freed by llama_model's destructor
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
if (adapter == nullptr) {
return;
}
if (adapter->model != nullptr) {
adapter->model->loras.erase(adapter);
adapter->model = nullptr;
}
delete adapter;
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {

View File

@@ -61,6 +61,8 @@ struct llama_adapter_lora_weight {
};
struct llama_adapter_lora {
llama_model * model = nullptr;
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
@@ -75,7 +77,7 @@ struct llama_adapter_lora {
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
llama_adapter_lora() = default;
explicit llama_adapter_lora(llama_model * model) : model(model) {}
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);

View File

@@ -1165,9 +1165,11 @@ bool llama_context::set_adapter_cvec(
int32_t il_end) {
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
// TODO: should we reserve?
bool res = cvec->apply(model, data, len, n_embd, il_start, il_end);
return cvec->apply(model, data, len, n_embd, il_start, il_end);
sched_need_reserve = true;
return res;
}
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {

View File

@@ -121,6 +121,9 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "l_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}

View File

@@ -111,8 +111,13 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
}
inpL = ggml_add(ctx0, cur, ffn_inp);
cb(inpL, "l_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = build_norm(inpL,

View File

@@ -86,6 +86,10 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}

View File

@@ -82,6 +82,7 @@ llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_grap
cur = ggml_add(ctx0, cur, ffn_inp);
// input for next layer
inpL = cur;
}
cur = inpL;

View File

@@ -66,8 +66,14 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
}
inpL = ggml_add(ctx0, cur, ffn_inp);
cb(inpL, "l_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = build_norm(inpL,
model.output_norm,

View File

@@ -362,6 +362,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;

View File

@@ -177,6 +177,9 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_out);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
}
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

View File

@@ -71,6 +71,7 @@ llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_pa
cur = ggml_add(ctx0, cur, residual);
cb(cur, "ffn_residual", il);
// input for next layer
inpL = cur;
}

View File

@@ -109,6 +109,8 @@ llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_gr
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}

View File

@@ -64,6 +64,9 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
cur = ggml_add(ctx0, cur, ffn_residual);
cb(cur, "post_ffn", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// Input for next layer
inpL = cur;
}

View File

@@ -64,6 +64,9 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
cur = ggml_add(ctx0, cur, ffn_residual);
cb(cur, "post_moe", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// Input for next layer
inpL = cur;
}

View File

@@ -56,6 +56,9 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
cur = ggml_add(ctx0, cur, ffn_residual);
cb(cur, "post_moe", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// Input for next layer
inpL = cur;
}

View File

@@ -101,6 +101,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
cur = ffn_out;
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);

View File

@@ -145,9 +145,11 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}

View File

@@ -89,6 +89,7 @@ struct test_context {
cparams.n_batch = 512;
cparams.samplers = configs.data();
cparams.n_samplers = configs.size();
cparams.kv_unified = true;
// If n_seq_max is not specified, calculate it from configs
if (n_seq_max < 0) {

View File

@@ -2448,7 +2448,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
// Analysis channel (reasoning) with final channel (content)
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n<|channel|>final<|message|>Hello, world!\nWhat's "
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's "
"up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.expect(message_assist_thoughts)
@@ -2461,15 +2461,6 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.expect_reasoning("I'm\nthinking")
.run();
// Reasoning format none - reasoning stays in content
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n<|channel|>final<|message|>Hello, world!\nWhat's "
"up?")
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
.expect_content(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?")
.run();
// Tool call with recipient in role header: " to=functions.NAME<|channel|>analysis<|message|>JSON"
tst.test(" to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}")
.tools({ special_function_tool })
@@ -2496,37 +2487,16 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
// Tool call with reasoning + content (analysis first, then tool call)
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n"
"<|channel|>analysis<|message|>I'm\nthinking<|end|>"
"<|start|>assistant to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Tool calling with extra channel before
// Complex tool calling
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>commentary"
" to=functions.special_function <|message|>{\"arg1\": 1}")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Reasoning after final channel
// Tool calling after final channel
tst.test(
"<|channel|>final<|message|><|end|>"
"<|start|>assistant<|channel|>analysis<|message|>Thinking about edit..."
)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.expect_reasoning("Thinking about edit...")
.expect_content("")
.run();
// Tool calling after final channel
tst.test(
"<|channel|>final<|message|><|end|>"
"<|start|>assistant<|channel|>analysis<|message|>Thinking about edit...<|end|>"
"<|channel|>analysis<|message|>Thinking about edit...<|end|>"
"<|start|>assistant<|channel|>commentary to=functions.edit <|constrain|>json"
"<|message|>{\"oldString\": \"if (part < railCount - 1) {\", \"newString\": \"if (part < 4) {\", \"replaceAll\": false}"
)
@@ -2561,19 +2531,17 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
})
.run();
// Parallel tool calls
// Structured output
tst.test(
" to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}\n"
"<|start|>assistant to=functions.special_function_with_opt<|channel|>analysis<|message|>{\"arg1\": 1, "
"\"arg2\": 2}")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
"<|channel|>analysis<|message|>I need to output the invoice details in JSON<|end|>"
"<|start|>assistant<|channel|>final <|constrain|>json"
"<|message|>"
R"({"amount": 123.45, "date": "2025-12-03"})"
)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.json_schema(invoice_schema)
.expect_reasoning("I need to output the invoice details in JSON")
.expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
.run();
}

View File

@@ -1897,8 +1897,9 @@ import sys
from datetime import datetime
from jinja2.sandbox import SandboxedEnvironment
tmpl = json.loads(sys.argv[1])
vars_json = json.loads(sys.argv[2])
merged_input = json.loads(sys.stdin.buffer.read().decode("utf-8"))
tmpl = merged_input["tmpl"]
vars_json = merged_input["vars"]
env = SandboxedEnvironment(
trim_blocks=True,
@@ -1921,8 +1922,9 @@ sys.stdout.buffer.write(result.encode())
static void test_template_py(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
t.test(name, [&tmpl, &vars, &expect](testing & t) {
// Prepare arguments
std::string tmpl_json = json(tmpl).dump();
std::string vars_json = vars.dump();
json merged;
merged["tmpl"] = json(tmpl);
merged["vars"] = vars;
#ifdef _WIN32
const char * python_executable = "python.exe";
@@ -1930,7 +1932,7 @@ static void test_template_py(testing & t, const std::string & name, const std::s
const char * python_executable = "python3";
#endif
const char * command_line[] = {python_executable, "-c", py_script.c_str(), tmpl_json.c_str(), vars_json.c_str(), NULL};
const char * command_line[] = {python_executable, "-c", py_script.c_str(), NULL};
struct subprocess_s subprocess;
int options = subprocess_option_combined_stdout_stderr
@@ -1944,6 +1946,20 @@ static void test_template_py(testing & t, const std::string & name, const std::s
t.assert_true("subprocess creation", false);
return;
}
FILE * p_stdin = subprocess_stdin(&subprocess);
// Write input
std::string input = merged.dump();
auto written = fwrite(input.c_str(), 1, input.size(), p_stdin);
if (written != input.size()) {
t.log("Failed to write complete input to subprocess stdin");
t.assert_true("subprocess stdin write", false);
subprocess_destroy(&subprocess);
return;
}
fflush(p_stdin);
fclose(p_stdin); // Close stdin to signal EOF to the Python process
subprocess.stdin_file = nullptr;
// Read output
std::string output;

Binary file not shown.

View File

@@ -148,7 +148,7 @@
</Tooltip.Trigger>
<Tooltip.Content side="right">
<p>Images require vision models to be processed</p>
<p>Image processing requires a vision model</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
@@ -173,7 +173,7 @@
</Tooltip.Trigger>
<Tooltip.Content side="right">
<p>Audio files require audio models to be processed</p>
<p>Audio files processing requires an audio model</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}

View File

@@ -57,7 +57,6 @@
// Handle ?q= parameter - create new conversation and send message
if (qParam !== null) {
await conversationsStore.createConversation();
await chatStore.sendMessage(qParam);
clearUrlParams();
} else if (modelParam || newChatParam === 'true') {
clearUrlParams();