server : print warning when HTTP timeout exceeded (#22907)

backend sampling: support returning post-sampling probs (#22622)
* server: Never return 0.0 post-sampling probabilities * backend sampling: support returning post-sampling probs
2026-05-14 17:07:43 +03:00 · 2026-05-10 22:00:18 +03:00 · 2026-05-10 19:12:02 +02:00 · 2026-05-10 18:46:54 +02:00 · 2026-05-10 17:32:41 +02:00 · 2026-05-10 17:00:11 +03:00
200 changed files with 18344 additions and 9441 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
+ARG IGC_VERSION=v2.32.7
+ARG IGC_VERSION_FULL=2_2.32.7+21184
+ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
 ARG IGDGMM_VERSION=22.9.0
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -103,6 +103,7 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
+    spirv-headers
  ];
 in

@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      ninja
      pkg-config
      git
-      spirv-headers
    ]
    ++ optionals useCuda [
      cudaPackages.cuda_nvcc
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -29,10 +29,10 @@ jobs:
      uses: actions/setup-python@v6
      with:
        python-version: '3.11'
+        pip-install: poetry==2.4.0
    - name: Install dependencies
      run: |
        cd gguf-py
-        python -m pip install poetry==2.3.2
        poetry install

    - name: Build package
--- a/.gitignore
+++ b/.gitignore
@@ -110,6 +110,7 @@ uv.lock

 # Nix

+flake.lock
 /result

 # Test binaries
--- a/1
+++ b/1
@@ -76,6 +76,7 @@
 /ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
 /ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
 /ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
+/ggml/src/ggml-zendnn/                  @avinashcpandey @Jiten1parmar @z-vishal
 /ggml/src/ggml.c                        @ggerganov
 /ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
--- a/README.md
+++ b/README.md
@@ -529,6 +529,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
 - [How to build](docs/build.md)
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
+- [Multi-GPU usage](docs/multi-gpu.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -369,9 +369,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.tool_arg_string_value(p.schema(until_suffix,
-                                                                 "tool-" + name + "-arg-" + param_name + "-schema",
-                                                                 param_schema, true)) :
+                                p.tool_arg_string_value(until_suffix) :
                                p.tool_arg_json_value(p.schema(
                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
                                    p.space()) +
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -80,7 +80,7 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
    if (!content.empty()) {
        jmsg["content"] = content;
    } else if (!content_parts.empty()) {
-        if (concat_typed_text) {
+        if (concat_typed_text || contains_media()) {
            std::string text;
            bool last_was_media_marker = false;
            // join parts with newline, do not add newline before or after media markers
--- a/common/chat.h
+++ b/common/chat.h
@@ -94,6 +94,15 @@ struct common_chat_msg {
               tool_name.empty() && tool_call_id.empty();
    }

+    bool contains_media() const {
+        for (const auto & part : content_parts) {
+            if (part.type == "media_marker") {
+                return true;
+            }
+        }
+        return false;
+    }
+
    void set_tool_call_ids(std::vector<std::string> &           ids_cache,
                           const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -109,16 +109,24 @@ static std::vector<llama_device_memory_data> common_get_device_memory_data(
        ret.back().total = total;
    }
    for (size_t i = 0; i < nd; i++) {
+        ggml_backend_dev_t dev = llama_model_get_device(model, i);
+
        size_t free;
        size_t total;
-        ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
+        ggml_backend_dev_memory(dev, &free, &total);

-        // devices can return 0 bytes for free and total memory if they do not
-        // have any to report. in this case, we will use the host memory as a fallback
-        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        // Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
+        // the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
+        // not assign anything to a device with an unknown memory budget.
        if (free == 0 && total == 0) {
-            free  = ret.back().free;
-            total = ret.back().total;
+            const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
+            if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
+                LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
+                        __func__, ggml_backend_dev_name(dev));
+            } else {
+                free  = ret.back().free;
+                total = ret.back().total;
+            }
        }
        ret[i].free  = free;
        ret[i].total = total;
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -158,8 +158,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
    for (size_t i = 0; i < cur_p->size; i++) {
        if (cur_p->data[i].id != forced) {
            cur_p->data[i].logit = -INFINITY;
-        } else {
-            cur_p->data[i].logit = +INFINITY; // force the token
        }
    }
 }
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

+    gsmpl->set_logits(ctx, idx);
+
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
@@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
            GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
            GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");

-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+            for (size_t i = 0; i < cur_p.size; ++i) {
+                if (cur_p.data[i].id == id) {
+                    cur_p.selected = i;
+                    break;
+                }
+            }

            return id;
        }
    }

-    gsmpl->set_logits(ctx, idx);
-
    // apply reasoning budget first
    llama_sampler_apply(rbudget, &cur_p);

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -710,7 +710,7 @@ class ModelBase:
                self._repack_nvfp4(name, weight, scale, scale2, input_scale)

        # Flush any remaining experts (fallback if n_experts was unknown)
-        for bid, proj_type in expert_blocks.keys():
+        for bid, proj_type in list(expert_blocks.keys()):
            self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)

        # Remove consumed tensors so get_tensors/modify_tensors won't see them
@@ -718,7 +718,7 @@ class ModelBase:
            self.model_tensors.pop(name, None)

        # Remove any remaining unused auxiliary tensors
-        for name in self.model_tensors.keys():
+        for name in list(self.model_tensors.keys()):
            if name.endswith((".k_scale", ".v_scale")):
                del self.model_tensors[name]

@@ -1063,7 +1063,7 @@ class TextModel(ModelBase):
        name, gen = item

        # Skip multimodal tensors
-        if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.")) \
+        if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
                or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
                or "vision_" in name or "audio_" in name or "sam_model" in name \
                or "token2wav." in name or "code2wav." in name \
@@ -1360,6 +1360,9 @@ class TextModel(ModelBase):
        if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
            # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
            res = "qwen2"
+        if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f":
+            # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6
+            res = "qwen35"
        if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
            # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
            res = "grok-2"
@@ -1567,6 +1570,9 @@ class TextModel(ModelBase):
        if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
            # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
            res = "f2llmv2"
+        if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
+            # ref: https://huggingface.co/sarvamai/sarvam-30b
+            res = "sarvam-moe"

        if res is None:
            logger.warning("\n")
@@ -5499,16 +5505,101 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
        yield from super().modify_tensors(data_torch, name, bid)


+class _Qwen35MRopeMixin:
+    # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers);
+    # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE
+    # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always
+    # written even when a particular checkpoint omits the field in `rope_parameters`.
+    _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0]
+
+    gguf_writer: gguf.GGUFWriter
+    rope_parameters: dict
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()  # ty: ignore[unresolved-attribute]
+        if "mrope_section" not in self.rope_parameters:
+            self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION)
+
+
@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
-class Qwen3_5TextModel(_LinearAttentionVReorderBase):
+class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
    model_arch = gguf.MODEL_ARCH.QWEN35


@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
-class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
+class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
    model_arch = gguf.MODEL_ARCH.QWEN35MOE


+# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under
+# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger
+# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as
+# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup.
+
+@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
+class MiniCPMV4_6TextModel(Qwen3_5TextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN35
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+
+        if name.startswith("model.merger."):
+            return None
+        # MTP tensors are not used at inference yet; align with Qwen3Next behaviour
+        if name.startswith("mtp"):
+            return None
+
+        return super().filter_tensors(item)
+
+
+@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
+class MiniCPMV4_6VisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams_vision is not None:
+            # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP
+            # positional embedding bucket grid (70 x 70), while the per-slice processing
+            # resolution is the preprocessor's `scale_resolution` (typically 448).
+            # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size`
+            # as the slice size and warmup resolution, so report `scale_resolution` there
+            # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules.
+            scale_resolution = self.preprocessor_config.get("scale_resolution")
+            if scale_resolution is not None:
+                self.hparams_vision["image_size"] = int(scale_resolution)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        # projector type string is consumed by clip_projector_type_from_string() in clip.cpp
+        # (mapped to PROJECTOR_TYPE_MINICPMV4_6).
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6)
+
+        # ViT merger 2x2 + final merger 2x2 = 4x spatial merge per dimension; used for slice alignment
+        self.gguf_writer.add_vision_projector_scale_factor(4)
+
+        # borrow wa_layer_indexes for vit_merger insertion point
+        insert_layer_id = int(self.global_config.get(
+            "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6)))
+        self.gguf_writer.add_vision_wa_layer_indexes([insert_layer_id])
+
+        # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx).
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(
+            self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+
+        # lm_head / MTP -> belong to the LM file
+        if name.startswith(("lm_head.", "mtp")):
+            return None
+
+        return super().filter_tensors(item)
+
+
@ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.GPT2
@@ -7900,13 +7991,37 @@ class Gemma4Model(Gemma3Model):
        rope_freqs_full = torch.tensor(values, dtype=torch.float32)
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)

+    def _generate_nvfp4_tensors(self):
+        # Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales
+        # each expert's contribution. It's mathematically equivalent to a per-expert
+        # scalar on the down_proj output, which is exactly where ffn_down_exps_s is
+        # applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the
+        # existing NVFP4 path produces the right scales.
+        n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
+        for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]:
+            bid_match = re.search(r"\.layers\.(\d+)\.", name)
+            if bid_match is None:
+                continue
+            bid = bid_match.group(1)
+            prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")]
+            w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)]
+            present = [w2 in self.model_tensors for w2 in w2_targets]
+            if not any(present):
+                continue
+            assert all(present), f"layer {bid}: partial NVFP4 quantization across experts"
+            r = self.model_tensors.pop(name)
+            for e, w2 in enumerate(w2_targets):
+                s = self.model_tensors[w2]
+                self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i]
+        super()._generate_nvfp4_tensors()
+
    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item

        if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
            name = name + ".weight"
-        if ".experts." in name and not name.endswith(".weight"):
+        if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
            name += ".weight"

        return super().filter_tensors((name, gen))
@@ -9405,10 +9520,126 @@ class MiniMaxM2Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("MiMoV2FlashForCausalLM")
+@ModelBase.register("MiMoV2FlashForCausalLM", "MiMoV2ForCausalLM")
 class MimoV2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.MIMO2

+    # MiMo V2-Flash, V2.5 and V2.5-Pro all ship 3 trained MTP layers under model.mtp.layers.{0,1,2}.
+    # The HF config does not expose the count, so it's hardcoded to match the count found in the safetensors.
+    _n_nextn = 3
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.block_count = self.hparams["num_hidden_layers"] + self._n_nextn
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    @staticmethod
+    def _tp_aware_qkv_dequant(weight: Tensor, scale_inv: Tensor,
+                              n_q: int, n_kv: int, hd: int, vhd: int,
+                              bs: int = 128) -> Tensor:
+        # MiMo-V2.5 (TP=4) and V2.5-Pro (TP=8) ship qkv_proj sharded across TP
+        # ranks; per rank, rows are stacked as [Q_per | K_per | V_per].
+        # weight_scale_inv has ceil(rows_per_rank/bs) block-rows per rank (last
+        # may extend past rows_per_rank with phantom rows not in the weight).
+        # Naive repeat_interleave aligns rank 0 only and mis-applies scales to
+        # later ranks once rows_per_rank isn't a multiple of bs.
+        # Re-group the per-rank [Q_per|K_per|V_per] rows into a single fused
+        # [Q | K | V] tensor matching the un-sharded original layout.
+        q_size = n_q * hd
+        k_size = n_kv * hd
+        v_size = n_kv * vhd
+        total_rows = q_size + k_size + v_size
+        if weight.shape[0] != total_rows:
+            raise ValueError(f"qkv_proj weight rows {weight.shape[0]} != q+k+v {total_rows}")
+
+        # detect TP from scale_inv block count, descending order so larger matches first
+        tp = None
+        for cand in (8, 4):
+            if total_rows % cand != 0:
+                continue
+            rpr = total_rows // cand
+            bpr = (rpr + bs - 1) // bs
+            if scale_inv.shape[0] == cand * bpr:
+                tp = cand
+                break
+        if tp is None:
+            raise ValueError(
+                f"qkv_proj: cannot detect TP - scale_inv rows {scale_inv.shape[0]}, "
+                f"q+k+v {total_rows}")
+
+        q_per = q_size // tp
+        k_per = k_size // tp
+        v_per = v_size // tp
+        rows_per_rank = q_per + k_per + v_per
+        blocks_per_rank = (rows_per_rank + bs - 1) // bs
+
+        scale_inv = scale_inv.float()
+        # per-row scale-row index: rank * blocks_per_rank + (rr_in_rank // bs)
+        row_idx = torch.arange(total_rows)
+        rr = row_idx % rows_per_rank
+        rank = row_idx // rows_per_rank
+        scale_row_idx = rank * blocks_per_rank + (rr // bs)
+        # gather: (total_rows, n_col_blocks)
+        scale_per_row_block = scale_inv[scale_row_idx]
+        # expand col-blocks -> cols: each block-col covers `bs` weight cols
+        scale_full = scale_per_row_block.repeat_interleave(bs, dim=1)
+        # crop to weight col count (in case last col-block isn't full)
+        scale_full = scale_full[:, : weight.shape[1]]
+        dequant = weight.float() * scale_full
+
+        if tp == 1:
+            return dequant
+
+        # Re-group per-rank [Q_per|K_per|V_per] rows into unified [Q | K | V]
+        qs, ks, vs = [], [], []
+        for r in range(tp):
+            base = r * rows_per_rank
+            qs.append(dequant[base : base + q_per])
+            ks.append(dequant[base + q_per : base + q_per + k_per])
+            vs.append(dequant[base + q_per + k_per : base + rows_per_rank])
+        return torch.cat(qs + ks + vs, dim=0)
+
+    def dequant_model(self):
+        # Capture raw FP8 (weight, scale_inv) lambdas for qkv_proj BEFORE super
+        # rewrites them with the existing dequant. Replace super's lambda after
+        # it runs so scale_inv removal still happens via the standard path.
+        qkv_overrides: dict[str, tuple[Callable, Callable, int]] = {}
+        qc = self.hparams.get("quantization_config")
+        if isinstance(qc, dict) and qc.get("quant_method") == "fp8":
+            pat = re.compile(r"^model\.layers\.(\d+)\.self_attn\.qkv_proj\.weight_scale_inv$")
+            for name in list(self.model_tensors.keys()):
+                m = pat.match(name)
+                if not m:
+                    continue
+                weight_name = name.removesuffix("_scale_inv")
+                if weight_name not in self.model_tensors:
+                    continue
+                qkv_overrides[weight_name] = (
+                    self.model_tensors[weight_name],
+                    self.model_tensors[name],
+                    int(m.group(1)),
+                )
+
+        super().dequant_model()
+
+        if not qkv_overrides:
+            return
+
+        n_q = self.hparams["num_attention_heads"]
+        hd = self.hparams["head_dim"]
+        vhd = self.hparams["v_head_dim"]
+        hybrid = self.hparams["hybrid_layer_pattern"]
+        n_layer_text = self.hparams["num_hidden_layers"]
+        for weight_name, (w_fn, s_fn, bid) in qkv_overrides.items():
+            # MTP layers (bid >= n_layer_text) use SWA-style attention dims
+            is_swa = True if bid >= n_layer_text else hybrid[bid] == 1
+            n_kv = self.hparams["swa_num_key_value_heads" if is_swa else "num_key_value_heads"]
+            self.model_tensors[weight_name] = (
+                lambda w_fn=w_fn, s_fn=s_fn, n_q=n_q, n_kv=n_kv, hd=hd, vhd=vhd:
+                    MimoV2Model._tp_aware_qkv_dequant(w_fn(), s_fn(), n_q, n_kv, hd, vhd)
+            )
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

@@ -9419,11 +9650,14 @@ class MimoV2Model(TextModel):

        n_head_kv = self.hparams["num_key_value_heads"]
        n_head_kv_swa = self.hparams["swa_num_key_value_heads"]
-        n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in self.hparams["hybrid_layer_pattern"]]
+        # Extend the per-layer pattern with SWA entries for the MTP blocks so the
+        # runtime arrays (sized to extended block_count) are fully populated.
+        hybrid = list(self.hparams["hybrid_layer_pattern"]) + [1] * self._n_nextn
+        n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in hybrid]
        self.gguf_writer.add_head_count_kv(n_head_kv_arr)

        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
-        self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
+        self.gguf_writer.add_sliding_window_pattern(hybrid)
        self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
@@ -9433,6 +9667,12 @@ class MimoV2Model(TextModel):

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))

+        v_scale = self.hparams.get("attention_value_scale")
+        if v_scale is not None:
+            self.gguf_writer.add_attn_value_scale(float(v_scale))
+
+        self.gguf_writer.add_nextn_predict_layers(self._n_nextn)
+
    _experts: list[dict[str, Tensor]] | None = None

    @classmethod
@@ -9442,13 +9682,21 @@ class MimoV2Model(TextModel):
        if "attention_sink" in name and not name.endswith(".weight"):
            name += ".weight"

-        # TODO: mimo v2 does not indicate the number of next-token-prediction layers, therefore we cannot do the same way as GLM4_MOE
-        if "model.mtp." in name:
-            return None
-
        return super().filter_tensors((name, gen))

    def modify_tensors(self, data_torch, name, bid):
+        # Remap MTP/NextN tensors to additional layer slots so the standard tensor map handles them.
+        # HF: model.mtp.layers.{i}.foo  ->  model.layers.{n_layer_text + i}.foo
+        m = re.match(r"^model\.mtp\.layers\.(\d+)\.(.*)$", name)
+        if m is not None:
+            mtp_idx = int(m.group(1))
+            assert mtp_idx < self._n_nextn, f"MTP layer index {mtp_idx} >= _n_nextn ({self._n_nextn})"
+            rest = m.group(2)
+            n_layer_text = self.hparams["num_hidden_layers"]
+            new_bid = n_layer_text + mtp_idx
+            name = f"model.layers.{new_bid}.{rest}"
+            bid = new_bid
+
        # process the experts separately
        if name.find("mlp.experts") != -1:
            n_experts = self.hparams["n_routed_experts"]
@@ -11346,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
+class SarvamMoEModel(BailingMoeV2Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
+    # Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
+    #  - full rotary (no partial_rotary_factor)
+    #  - expert bias is zero-mean normalized at load time
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+        if name.endswith(".expert_bias"):
+            # Sarvam normalizes expert bias to zero mean
+            inner = gen
+
+            def gen():
+                t = inner()
+                return t - t.mean()
+        return super().filter_tensors((name, gen))
+
+
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
 class GroveMoeModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GROVEMOE
@@ -13463,6 +13739,27 @@ class DotsOCRVisionModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLTextModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+        if name.startswith("llm."):
+            name = name.replace("llm.", "", 1)
+        elif name.startswith("norm."):
+            return None
+        return super().filter_tensors((name, gen))
+
+
+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLVisionModel(Qwen2VLVisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.global_config['model_type'] = "qwen2_vl"
+
+
 ###### CONVERSION LOGIC ######


@@ -13719,7 +14016,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
    # Step3-VL keeps text config under text_config but uses a custom top-level architecture.
    # For text conversion we route to a dedicated text-only class.
    # TODO: refactor this later to avoid adding exception here
-    if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
+    if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
        return arch

    # if "architectures" is found in the sub-config, use that instead
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -155,6 +155,7 @@ models = [
    {"name": "joyai-llm",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
    {"name": "kanana2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
    {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
+    {"name": "sarvam-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -175,6 +176,7 @@ pre_computed_hashes = [
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
    {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
+    {"name": "qwen35",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM-V-4_6", "chkhsh": "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f"},
    {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
    # jina-v2-de variants
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -737,6 +737,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|

+## Compile-time Flags
+
+Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.
+
+| Name            | Function                                                                         |
+|-----------------|----------------------------------------------------------------------------------|
+| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
+
 ## Design Rule

 - Open to all contributors.
--- a/docs/multi-gpu.md
+++ b/docs/multi-gpu.md
@@ -0,0 +1,127 @@
+# Using multiple GPUs with llama.cpp
+
+This guide explains how to run [llama.cpp](https://github.com/ggml-org/llama.cpp) across more than one GPU. It covers the split modes, the command-line flags that control them, the limitations you need to know about, and ready-to-use recipes for `llama-cli` and `llama-server`.
+
+The CLI arguments listed here are the same for both tools - or most llama.cpp binaries for that matter.
+
+---
+
+## When you need multi-GPU
+
+Reach for multi-GPU when one of these is true:
+
+- **The model doesn't fit in a single GPU's VRAM.** By spreading the weights across two or more GPUs the whole model can stay on accelerators. Otherwise part of the model will need to be run off of the comparatively slower system RAM.
+- **You want more throughput.** By distributing the computation across multiple GPUs, each individual GPU has to do less work. This can result in better prefill and/or token generation performance, depending on the split mode and interconnect speed vs. the speed of an individual GPU.
+
+---
+
+## The split modes
+
+Set with `--split-mode` / `-sm`.
+
+| Mode | What it does | When to use |
+|---|---|---|
+| `none` | Use a single GPU only. Pick which one with `--main-gpu`. | You explicitly want to confine the model to one GPU even though more are visible. |
+| `layer` (**default**) | Pipeline parallelism. Each GPU holds a contiguous slice of layers. The KV cache for layer *l* lives on the GPU that owns layer *l*. | Default and most compatible multi-GPU choice. You want more memory than a single GPU provides and your priority is a fast prefill. Can tolerate slow interconnect speeds between GPUs. |
+| `row` | **Deprecated.** Older row-split tensor-parallel path with comparatively poor performance. Splits only dense weights across GPUs. Superseded by `tensor` which should be universally superior if it can be used. | Avoid in new deployments. |
+| `tensor` | **EXPERIMENTAL.** Tensor parallelism that splits both weights *and* KV across the participating GPUs via a "meta device" abstraction. | You want more memory than a single GPU provides and your priority is fast token generation. Prefill speeds approach pipeline parallel speeds for large, dense models and fast GPU interconnect speeds. Treat as experimental as the code is less mature than pipeline parallelism. Performance should be good for multiple NVIDIA GPUs using the CUDA backend, no guarantees otherwise. |
+
+> Pipeline parallel (`layer`) vs. tensor parallel (`tensor`): pipeline-parallel runs different layers on different GPUs and processes tokens sequentially through the pipeline. This minimizes data transfers between GPUs but requires many tokens to scale well. Tensor-parallel splits each layer across GPUs and does multiple cross-GPU reductions per layer. This enables parallelizing any workload but is much more bottlenecked by the GPU interconnect speed. Pipeline-parallel maximizes batch throughput; tensor-parallel minimizes latency.
+
+---
+
+## Command-line arguments reference
+
+| Short | Long | Value | Default | Notes |
+|---|---|---|---|---|
+| `-sm` | `--split-mode` | `none` \| `layer` \| `row` (deprecated) \| `tensor` | `layer` | See modes above. |
+| `-ts` | `--tensor-split` | comma-separated proportions, e.g. `3,1` | mode-dependent | How much of the model goes to each GPU. If omitted, `layer`/`row` use automatic splitting proportional to memory, while `tensor` splits tensor segments evenly. With `3,1` on two GPUs, GPU 0 gets 75 %, GPU 1 gets 25 %. The values follow the order in `--device`. |
+| `-mg` | `--main-gpu` | integer device index | `0` | The single GPU used in `--split-mode none`. |
+| `-ngl` | `--n-gpu-layers` / `--gpu-layers` | integer \| `auto` \| `all` | `auto` | Maximum number of layers to keep in VRAM. Use `999` or `all` to push everything possible to the GPUs. |
+| `-dev` | `--device` | comma-separated device names, or `none` | auto | Restrict which devices llama.cpp may use. See `--list-devices` for names. |
+| | `--list-devices` | - | - | Print the available devices and their memory. Run this first to learn the names you'd pass to `--device`. |
+| `-fa` | `--flash-attn` | `on` \| `off` \| `auto` | `auto` | Required when using `--split-mode tensor` and/or quantized V cache. Supported (and therefore enabled by default) for most combinations of models and backends. |
+| `-ctk` | `--cache-type-k` | `f32` \| `f16` \| `bf16` \| `q8_0` \| `q4_0` \| ... | `f16` | KV cache type for K. |
+| `-ctv` | `--cache-type-v` | same as `-ctk` | `f16` | KV cache type for V. |
+| `-fit` | `--fit` | `on` \| `off` | `on` | Auto-fit unset args to device memory. **Not supported with `tensor`. You may need to manually set the `--ctx-size` to make the model fit.**  |
+
+As for any CUDA program, the environment variable `CUDA_VISIBLE_DEVICES` can be used to control which GPUs to use for the CUDA backend: if you set it, llama.cpp only sees the specified GPUs. Use `--device` to select GPUs from among those visible to llama.cpp; this works for any backend.
+
+---
+
+## Recipes
+
+### 1. Default - pipeline parallel across all visible GPUs
+
+```bash
+llama-cli -m model.gguf
+llama-server -m model.gguf
+```
+
+Easiest configuration. KV cache spreads across the GPUs along with the layers. `--fit` (on by default) sizes things automatically.
+
+### 2. Pipeline parallel with a custom split ratio
+
+```bash
+llama-cli -m model.gguf -ts 3,1
+```
+
+Useful when the GPUs have different amounts of memory: here GPU 0 receives 3 parts and GPU 1 receives 1 part. Proportions are normalized, so `-ts 3,1` is the same as e.g. `-ts 75,25`.
+
+### 3. Single-GPU mode, picking a specific GPU
+
+```bash
+llama-cli --list-devices
+llama-cli -m model.gguf -dev CUDA1
+```
+
+This uses only the device that `--list-devices` reports as `CUDA1`.
+
+### 4. Tensor parallelism (experimental)
+
+```bash
+llama-cli -m model.gguf -sm tensor -ctk f16 -ctv f16
+```
+
+- `--flash-attn off` (or `--flash-attn auto` resolving to `off` when flash attention isn't supported) is a hard error.
+- KV cache types must be non-quantized: `f32`, `f16`, or `bf16`. Support for quantized KV cache is not implemented and trying to use it will result in an error.
+- Mark this configuration as experimental in your tooling: validate output quality before deploying.
+- `--split-mode tensor` is not implemented for all architectures. The following will fail with *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '...'"*:
+
+  - **MoE / hybrid:** Grok, MPT, OLMoE, DeepSeek2, GLM-DSA, Nemotron-H, Nemotron-H-MoE, Granite-Hybrid, LFM2-MoE, Minimax-M2, Mistral4, Kimi-Linear, Jamba, Falcon-H1
+  - **State-space / RWKV-style:** Mamba, Mamba2 (and the hybrid Mamba-attention models above)
+  - **Other:** PLAMO2, MiniCPM3, Gemma-3n, OLMo2, BitNet, T5
+
+### 5. With NCCL
+
+There's no runtime flag for NCCL - it's selected at build time (`-DGGML_CUDA_NCCL=ON`, this is the default). Note that NCCL is **not** automatically distributed with CUDA and you may need to install it manually - when in doubt check the CMake log to see whether or not it can find the package. When llama.cpp is compiled with NCCL support it uses it automatically for cross-GPU reductions in `tensor` mode. When NCCL is missing on a multi-GPU build, you'll see this one-time warning and performance will be lower:
+
+```
+NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal
+```
+
+When using the "ROCm" backend (which is the ggml CUDA code translated for AMD via HIP), the AMD equivalent RCCL can be used by compiling with `-DGGML_HIP_RCCL=ON`. Note that RCCL is by default *disabled* because (unlike NCCL) it was not universally beneficial during testing.
+### 6. With CUDA peer-to-peer access (`GGML_CUDA_P2P`)
+
+CUDA peer-to-peer (P2P) lets GPUs transfer data directly between each other instead of going through system memory, which generally improves multi-GPU performance. It is **opt-in** at runtime - set the environment variable `GGML_CUDA_P2P` to any value to enable it:
+
+```bash
+GGML_CUDA_P2P=1 llama-cli -m model.gguf -sm tensor
+```
+
+P2P requires driver support (usually restricted to workstation/datacenter GPUs) and **may cause crashes or corrupted outputs on some motherboards or BIOS configurations** (e.g. when IOMMU is enabled). If you see instability after enabling it, unset the variable.
+
+---
+
+## Troubleshooting
+
+| Symptom | How to fix |
+|---|---|
+| Startup error *"SPLIT_MODE_TENSOR requires flash_attn to be enabled"* | Add `-fa on` or remove `-fa off`. |
+| Startup error *"simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented"* | Use `-ctk f16 -ctv f16` (or `bf16`/`f32`) with `--split-mode tensor`. |
+| Startup error *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture 'X'"* | Architecture not on the TENSOR allow-list. Use `--split-mode layer`. |
+| Warning *"NCCL is unavailable, multi GPU performance will be suboptimal"* | llama.cpp wasn't built with NCCL. Either accept the lower performance or install NCCL and rebuild. |
+| CUDA OOM at startup or during prefill in `--split-mode tensor` | Auto-fit is disabled in this mode, so reduce memory pressure yourself. In order from least to most disruptive: lower `--ctx-size` (`-c`) (KV cache is roughly proportional to `n_ctx`); for `llama-server`, lower `--parallel` (`-np`) (a slot KV cache is allocated per concurrent sequence); as a last resort, reduce `--n-gpu-layers` (`-ngl`) (the remaining layers run on CPU and inference will be much slower). |
+| Performance is worse with multi-GPU than single-GPU | The performance is bottlenecked by GPU interconnect speed. For `--split-mode tensor`, verify that NCCL is being used. Try `--split-mode layer` (less communication than `tensor`). Increase GPU interconnect speed via more PCIe lanes or e.g. NVLink (if available). |
+| GPU not used at all | `--n-gpu-layers` is `0` or too low - try explicitly setting `-ngl all`. Or you are accidentally hiding the GPUs via an environment variable like `CUDA_VISIBLE_DEVICES=-1`. Or your build doesn't include support for the relevant backend. |
+| Crashes or corrupted outputs after setting `GGML_CUDA_P2P=1` | Some motherboards and BIOS settings (e.g. with IOMMU enabled) don't support CUDA peer-to-peer reliably. Unset `GGML_CUDA_P2P`. |
--- a/docs/multimodal/minicpmv4.6.md
+++ b/docs/multimodal/minicpmv4.6.md
@@ -0,0 +1,49 @@
+## MiniCPM-V 4.6
+
+### Prepare models and code
+
+Download the [MiniCPM-V-4_6](https://huggingface.co/openbmb/MiniCPM-V-4_6) PyTorch model from Hugging Face into a folder named "MiniCPM-V-4_6".
+
+The model must be the standard `transformers` v5.7.0+ checkpoint (no `trust_remote_code`); the architecture in `config.json` is `MiniCPMV4_6ForConditionalGeneration` with a `qwen3_5_text` text model and a SigLIP-based vision tower plus a window-attention `vit_merger`.
+
+### Build llama.cpp
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md).
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-V 4.6
+
+Unlike older MiniCPM-V variants, MiniCPM-V 4.6 is converted directly through `convert_hf_to_gguf.py`. The same script is invoked twice on the original Hugging Face directory: once to produce the language-model GGUF and once with `--mmproj` to produce the multimodal projector GGUF.
+
+```bash
+# language model
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --outfile ../MiniCPM-V-4_6/ggml-model-f16.gguf
+
+# multimodal projector (vision tower + window-attention vit_merger + DownsampleMLP merger)
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --mmproj --outfile ../MiniCPM-V-4_6/mmproj-model-f16.gguf
+
+# optional: quantize to Q4_K_M
+./build/bin/llama-quantize ../MiniCPM-V-4_6/ggml-model-f16.gguf ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac:
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf
+```
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -17,7 +17,7 @@ Legend:
 |                              ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -36,15 +36,15 @@ Legend:
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                             DIAG | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             DIAG | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
@@ -101,11 +101,11 @@ Legend:
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                              SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/examples/sycl/start-svr.sh
+++ b/examples/sycl/start-svr.sh
@@ -111,14 +111,14 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
    echo "Use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
    echo "Use all Intel GPUs, including iGPU & dGPU"
    GPUS_SETTING="-sm ${SPLIT_MODE}"
 fi

-echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap "
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap --host 0.0.0.0 --port 8000"
 ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000


--- a/examples/sycl/test.sh
+++ b/examples/sycl/test.sh
@@ -119,7 +119,7 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
    echo "Use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
    echo "Use all Intel GPUs, including iGPU & dGPU"
--- a/flake.lock
+++ b/flake.lock
@@ -1,58 +0,0 @@
-{
-  "nodes": {
-    "flake-parts": {
-      "inputs": {
-        "nixpkgs-lib": "nixpkgs-lib"
-      },
-      "locked": {
-        "lastModified": 1730504689,
-        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
-        "owner": "hercules-ci",
-        "repo": "flake-parts",
-        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
-        "type": "github"
-      },
-      "original": {
-        "owner": "hercules-ci",
-        "repo": "flake-parts",
-        "type": "github"
-      }
-    },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1732014248,
-        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "nixpkgs-lib": {
-      "locked": {
-        "lastModified": 1730504152,
-        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
-        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
-      },
-      "original": {
-        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-parts": "flake-parts",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 11)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -169,7 +169,7 @@ extern "C" {
        // device type
        enum ggml_backend_dev_type type;
        // device id
-        //   for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        //   for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:c1:00.0")
        //   if the id is unknown, this should be NULL
        const char * device_id;
        // device capabilities
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -965,7 +965,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
        }
        if (sched->debug > 1) {
            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_desc(node), node->name,
                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
                graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -203,7 +203,6 @@
 #elif defined(__riscv)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
-#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -480,6 +480,104 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }

+#if defined(__riscv_v)
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { // Q1_0 (vx) x Q8_0 (vy) dot product over n values, result stored in *s; e8/m1 variant
+    const int qk = QK1_0;
+    const int nb = n / qk; // number of Q1_0 blocks to process
+    assert(n % qk == 0);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    // LMUL = 1: request vl = 32 int8 lanes per pass; the assert below checks that all 32 were granted
+    const size_t vl32 = __riscv_vsetvl_e8m1(32);
+    assert(vl32 == 32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1); // only element 0 is written; it seeds the reduction below
+
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); // fp16 field d of the Q1_0 block, applied as a multiplier after the inner loop
+
+        float acc = 0;
+
+        for (int k = 0; k < 4; ++k) { // each Q1_0 block is paired with 4 consecutive Q8_0 blocks
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const vbool8_t is_not_zero = __riscv_vlm_v_b8(x[ib].qs + 4 * k, vl32); // 32 mask bits (4 bytes of qs) for this Q8_0 block
+
+            const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
+            const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
+            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(neg_qy, qy, is_not_zero, vl32); // mask bit set -> +qy, clear -> -qy
+
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32); // widening sum of the 32 signed lanes into int16 element 0
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red); // weight the integer sum by the Q8_0 block's d
+        }
+
+        sumf += d0 * acc;
+    }
+
+    *s = sumf;
+}
+
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { // Q1_0 (vx) x Q8_0 (vy) dot product over n values, result stored in *s; e8/m2 variant
+    const int qk = QK1_0;
+    const int nb = n / qk; // number of Q1_0 blocks to process
+    assert(n % qk == 0);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    // LMUL = 2: request vl = 32 int8 lanes per pass using a two-register group; the assert below checks that all 32 were granted
+    const size_t vl32 = __riscv_vsetvl_e8m2(32);
+    assert(vl32 == 32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1); // only element 0 is written; it seeds the reduction below
+
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); // fp16 field d of the Q1_0 block, applied as a multiplier after the inner loop
+
+        float acc = 0;
+
+        for (int k = 0; k < 4; ++k) { // each Q1_0 block is paired with 4 consecutive Q8_0 blocks
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const vbool4_t is_not_zero = __riscv_vlm_v_b4(x[ib].qs + 4 * k, vl32); // 32 mask bits (4 bytes of qs) for this Q8_0 block
+
+            const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
+            const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
+            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(neg_qy, qy, is_not_zero, vl32); // mask bit set -> +qy, clear -> -qy
+
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32); // widening sum of the 32 signed lanes into int16 element 0
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red); // weight the integer sum by the Q8_0 block's d
+        }
+
+        sumf += d0 * acc;
+    }
+
+    *s = sumf;
+}
+#endif
+
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // Q1_0 x Q8_0 dot product entry point: dispatches on the runtime vector length
+#if defined(__riscv_v)
+    assert(nrc == 1);
+
+    const size_t vlen_bits = __riscv_vlenb() * 8; // vector register length: bytes -> bits
+
+    if (vlen_bits >= 256) {
+        ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy); // bs/bx/by/nrc are not forwarded to the vector kernels
+    } else if (vlen_bits >= 128) {
+        ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy);
+    } else {
+        ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); // vector length below 128 bits: use the generic implementation
+    }
+#else
+    ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); // built without the RISC-V vector extension
+#endif
+}
+
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
--- a/ggml/src/ggml-cuda/allreduce.cu
+++ b/ggml/src/ggml-cuda/allreduce.cu
@@ -0,0 +1,968 @@
+#include "allreduce.cuh"
+
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
+#include "convert.cuh"
+#include "ggml-impl.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+
+// ---------------------------------------------------------------------------
+// CUDA AllReduce for tensor-parallel inference across two GPUs.
+//
+// Provides an in-place sum reduction over matching tensors on two CUDA
+// devices in the same process.  Used by the tensor-split path alongside
+// NCCL; targets setups without NVLink, where data is exchanged between the
+// GPUs by staging it through pinned host memory over PCIe.
+//
+// Two reduction strategies are selected per call by tensor size:
+//
+//   * Chunked kernel path (small reductions): a single CUDA kernel both
+//     stages data through pinned host memory and performs the local sum.
+//     Cross-GPU synchronization happens *inside the kernel* (busy-wait on
+//     a host-memory flag), which keeps launch overhead low for the
+//     latency-sensitive token-generation case.
+//
+//   * Copy-engine path (large reductions): the transfer is split into
+//     D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
+//     followed by a small device-side add kernel.  Cross-GPU
+//     synchronization happens *outside the kernel*, via CUDA events
+//     between streams.  This keeps the compute engine free while large
+//     transfers are in flight, which matters for prefill-sized tensors.
+//     Reductions larger than the per-call inner cap are processed by an
+//     outer chunker that issues sequential inner calls.
+// ---------------------------------------------------------------------------
+
+// ---------------------------------------------------------------------------
+// Cross-GPU signal mechanism
+//
+// One int per (slot, rank, block) triple in pinned host memory.  Each AR call writes a
+// strictly increasing token (= the AR call number) into its own arrival int.
+// The peer spins until its read of the other's arrival int equals the token
+// it expects for this call -- a mismatch means the peer hasn't arrived yet.
+// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
+// days at thousands of ARs/sec), so arrival ints don't need to be reset
+// between calls; we initialize once at pipeline init and let the values
+// accumulate.
+//
+// There is exactly one writer (the owning GPU) and one reader (the peer), so
+// we don't need atomics.  A volatile store paired with __threadfence_system()
+// provides the release ordering that makes the D2H writes visible system-wide
+// before the arrival token is observed.
+//
+// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
+// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
+// portable choice.
+// ---------------------------------------------------------------------------
+
+// Writes `token` into the arrival int at `p` through a volatile-qualified
+// pointer.
+static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
+    volatile int * slot = p;
+    *slot = token;
+}
+// Reads the arrival int at `p` through a volatile-qualified pointer and
+// returns its current value.
+static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
+    const volatile int * slot = p;
+    return *slot;
+}
+
+// Byte spacing between adjacent arrival ints.  64 bytes (one cache line)
+// ensures each GPU/block's arrival slot lives on its own line, preventing
+// false-sharing stalls on the polling GPU.
+static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;
+// The kernel steps between per-block slots in units of int
+// (ARRIVAL_STRIDE / sizeof(int)) while the host computes byte offsets from
+// ARRIVAL_STRIDE directly; both only agree when the stride is a whole number
+// of ints.
+static_assert(GGML_CUDA_AR_ARRIVAL_STRIDE % sizeof(int) == 0,
+              "GGML_CUDA_AR_ARRIVAL_STRIDE must be a multiple of sizeof(int)");
+
+// Number of blocks the chunked kernel launches with.  Each block stripes a
+// disjoint slice of the data and synchronizes through its own arrival-token
+// slot so multiple SMs can pump PCIe stores in parallel.
+static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8; // also sizes the arrival ring: one ARRIVAL_STRIDE-sized slot per block
+
+// ---------------------------------------------------------------------------
+// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
+//
+// Both GPUs run this kernel simultaneously on independent streams.  sendbuf
+// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
+// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
+// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe).  When
+// T_dst == T_wire the casts below are no-ops.
+//
+// Each GPU runs three phases:
+//
+//   Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
+//                          single-instruction-width vectors into host_mine.
+//                          __threadfence_system() commits these writes to host
+//                          memory.
+//   Phase 2 (thread 0):    write token to arrival_mine; spin until
+//                          arrival_other == token.
+//   Phase 3 (all threads): read T_wire vectors from host_other, cast
+//                          each element to T_dst, and sum with the local
+//                          sendbuf value (also rounded through T_wire so that
+//                          both GPUs truncate identically -- this guarantees
+//                          bit-equivalent results across the two devices).
+//
+// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
+// threads to keep multiple SMs issuing PCIe stores in parallel.  Each block
+// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
+// thread 0 of each block signals/spins on that slot independently of other
+// blocks.  Tail elements (the leftover < ELEMS_PER_VEC at the end) are
+// handled only by block 0 to avoid cross-block writes to the same slots.
+// ---------------------------------------------------------------------------
+// Parameters:
+//   sendbuf        local contribution; read in phase 1 and again in phase 3
+//   recvbuf        receives the element-wise sum; not declared __restrict__,
+//                  presumably so it may alias sendbuf -- TODO confirm at call sites
+//   host_mine      staging buffer this GPU fills with T_wire elements
+//   host_other     staging buffer filled by the peer and read here
+//   count          number of elements to reduce
+//   arrival_mine   base of this GPU's per-block arrival ints
+//   arrival_other  base of the peer's per-block arrival ints
+//   token          value stored to arrival_mine and awaited in arrival_other
+template <typename T_dst, typename T_wire>
+static __global__ void ggml_cuda_ar_kernel(
+        const T_dst  *              sendbuf,
+        T_dst        *              recvbuf,
+        T_wire       * __restrict__ host_mine,
+        const T_wire * __restrict__ host_other,
+        int                         count,
+        int *                       arrival_mine,
+        int *                       arrival_other,
+        int                         token) {
+
+    // Vector unit for the wire type, sized to the arch's widest single-instruction
+    // copy (16 B on Volta+).  Each phase-1 iter writes one vector to host memory;
+    // each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
+    constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
+    // Distance between two blocks' arrival slots, expressed in ints.
+    constexpr int ARRIVAL_INTS  = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
+
+    const int tid       = threadIdx.x;
+    const int nt        = blockDim.x;
+    const int bid       = blockIdx.x;
+    const int gtid      = bid * nt + tid;               // thread index across the whole grid
+    const int gnt       = gridDim.x * nt;               // total threads in the grid
+    const int count_vec = count / ELEMS_PER_VEC;        // number of full vectors
+    const int tail      = count_vec * ELEMS_PER_VEC;    // index of the first leftover element
+
+    // Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
+    {
+        for (int i = gtid; i < count_vec; i += gnt) {
+            const int off = i * ELEMS_PER_VEC;
+            T_wire wire[ELEMS_PER_VEC];
+            #pragma unroll
+            for (int k = 0; k < ELEMS_PER_VEC; ++k) {
+                wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
+            }
+            ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
+        }
+        // Leftover elements: one thread of block 0 per element, so blockDim.x
+        // must be at least count - tail (which is < ELEMS_PER_VEC).
+        if (bid == 0 && tid < count - tail) {
+            host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
+        }
+    }
+
+    // Commit this block's host writes before signalling.
+    __threadfence_system();
+    __syncthreads();
+
+    // Phase 2: thread 0 of each block signals on its own arrival slot, then
+    // spins for the matching slot from peer.  Per-block tokens mean blocks
+    // proceed independently -- no inter-block barrier needed.
+    if (tid == 0) {
+        int       * my_slot    = arrival_mine  + bid * ARRIVAL_INTS;
+        const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
+
+        ggml_cuda_ar_signal_set(my_slot, token);
+        __threadfence_system(); // make our signal visible system-wide
+
+        while (ggml_cuda_ar_signal_get(other_slot) != token) {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+            __nanosleep(100);
+#else
+            // Pre-Volta device code is never meant to run: pipeline init
+            // refuses devices below GGML_CUDA_CC_VOLTA.
+            NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+        }
+    }
+
+    // Hold the rest of the block until thread 0 has seen the peer's token.
+    __syncthreads();
+
+    // Acquire peer's host_other writes (this block's stripe of them).
+    __threadfence_system();
+
+    // Phase 3: read peer's T_wire vector, cast both sides through T_wire for
+    // bit-equivalence, sum in T_dst precision, and write back to recvbuf.
+    {
+        for (int i = gtid; i < count_vec; i += gnt) {
+            const int off = i * ELEMS_PER_VEC;
+            T_wire wire[ELEMS_PER_VEC];
+            ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
+            #pragma unroll
+            for (int k = 0; k < ELEMS_PER_VEC; ++k) {
+                // sendbuf[off + k] is read before recvbuf[off + k] is written
+                // by this same thread.
+                const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
+                recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
+            }
+        }
+        // Same leftover handling as in phase 1.
+        if (bid == 0 && tid < count - tail) {
+            const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
+            recvbuf[tail + tid] =
+                ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
+        }
+    }
+}
+
+// Combined load-convert-add kernel.  The peer's contribution arrives as T_src
+// (which may be a lower-precision type than T_dst when the BF16 round-trip is
+// active).  For bit-equivalence between the two GPUs, dst is first rounded
+// through T_src's precision via ggml_cuda_cast -- peer already truncated its
+// own value the same way before sending -- so both sides perform identical
+// arithmetic.  When T_dst == T_src the round-trip cast is a no-op.
+// dst[i] <- T_dst(T_src(dst[i])) + T_dst(src[i]) for every i in [0, count).
+// Each thread starts at its global index and advances by the total number of
+// threads in the grid until it runs past `count`.
+template <typename T_dst, typename T_src>
+static __global__ void ggml_cuda_ar_add_kernel(
+        T_dst       * __restrict__ dst,
+        const T_src * __restrict__ src,
+        int count) {
+    const int first  = blockIdx.x * blockDim.x + threadIdx.x;
+    const int stride = gridDim.x * blockDim.x;
+
+    int idx = first;
+    while (idx < count) {
+        // Round the local value through T_src before adding the peer's value.
+        const T_src local_rounded = ggml_cuda_cast<T_src>(dst[idx]);
+        dst[idx] = ggml_cuda_cast<T_dst>(local_rounded) + ggml_cuda_cast<T_dst>(src[idx]);
+        idx += stride;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Pipeline structure
+// ---------------------------------------------------------------------------
+
+// Number of slots in the event / arrival ring.  Two slots is sufficient:
+// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
+// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
+// from AR N-2 by the time we get to AR N.  acquire_slot's
+// cudaEventSynchronize on ev.ker for both devices makes that consumption
+// explicit before we overwrite host_buf[slot] for the new AR.
+static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;
+
+// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
+// Larger tensors are reduced by issuing multiple chunked launches.
+// Pipeline init also uses this as the per-slot size of host_buf (buf_bytes).
+static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB
+
+// Copy-engine path: largest tensor accepted on this path; sets host_large /
+// dev_tmp allocation size.
+static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB
+
+// AR wire size at which the copy-engine path takes over from the chunked-
+// kernel path.  Override via GGML_CUDA_AR_COPY_THRESHOLD.
+static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
+// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
+// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
+// the peer); the clamps cover the cases where nbytes/4 is too small (per-
+// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
+// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
+// HEURISTIC_MIN is above COPY_CHUNK_BYTES_MIN, so the heuristic alone never
+// produces more than GGML_CUDA_AR_COPY_MAX_CHUNKS chunks for a tensor of at
+// most GGML_CUDA_AR_COPY_MAX_BYTES.
+static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024;       // 512 KB
+static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024;  // 2 MB
+// Absolute floor that an env-var override is allowed to set; this caps the
+// per-slot copy-event array.  256 KB -> up to 128 chunks per 32 MB tensor.
+static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
+// ceil(COPY_MAX_BYTES / COPY_CHUNK_BYTES_MIN); 128 with the values above.
+static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
+    static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
+                    GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
+
+// Event handles belonging to one (device, ring slot) pair.  All handles start
+// out null; they are created by pipeline init and destroyed by pipeline free.
+struct ggml_cuda_ar_event_slot {
+    // Upstream computation complete (recorded on the caller's compute stream).
+    cudaEvent_t app{nullptr};
+    // Copy-engine D2H chunks complete, one event per chunk.
+    cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS]{};
+    // Copy-engine H2Ds complete (handoff AR stream -> compute stream).
+    cudaEvent_t h2d{nullptr};
+    // AllReduce kernel complete.
+    cudaEvent_t ker{nullptr};
+};
+
+// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
+// in one place, with the host handle preserved for cudaFreeHost.  Used where
+// the CPU never touches the buffer -- only the device reads/writes via the
+// mapped device pointer.  Required on systems where cudaDevAttrCanUseHost-
+// PointerForRegisteredMem is 0 and the host pointer can't be used as a
+// device pointer.
+// Pairs a mapped pinned host allocation with its device-side alias.
+struct ggml_cuda_ar_host_mapping {
+    uint8_t * host = nullptr;   // handle passed to cudaFreeHost; host-side address for cudaMemcpyAsync
+    uint8_t * dev  = nullptr;   // address used by kernels and cudaMemset
+
+    // Allocates `bytes` of portable, mapped pinned memory and resolves its
+    // device pointer.  Returns the first failing CUDA status, or cudaSuccess.
+    // On failure nothing stays allocated and `host` is null.
+    cudaError_t alloc(size_t bytes) {
+        void * host_raw = nullptr;
+        const cudaError_t alloc_rc =
+            cudaHostAlloc(&host_raw, bytes, cudaHostAllocPortable | cudaHostAllocMapped);
+        if (alloc_rc != cudaSuccess) {
+            host = nullptr;
+            return alloc_rc;
+        }
+        host = static_cast<uint8_t *>(host_raw);
+
+        void * dev_raw = nullptr;
+        const cudaError_t map_rc = cudaHostGetDevicePointer(&dev_raw, host, 0);
+        if (map_rc == cudaSuccess) {
+            dev = static_cast<uint8_t *>(dev_raw);
+            return map_rc;
+        }
+
+        // Mapping failed: give the pinned memory back and reset both pointers.
+        cudaFreeHost(host);
+        host = nullptr;
+        dev  = nullptr;
+        return map_rc;
+    }
+
+    // Releases the allocation, if any.  Safe to call repeatedly.
+    void free() {
+        if (host == nullptr) {
+            return;
+        }
+        cudaFreeHost(host);
+        host = nullptr;
+        dev  = nullptr;
+    }
+};
+
+// All state owned by one AllReduce pipeline.  Created zero-initialized by
+// ggml_cuda_ar_pipeline_init() and released by ggml_cuda_ar_pipeline_free().
+struct ggml_cuda_ar_pipeline {
+    int      n_devices;    // number of ranks; init only accepts 2
+    int      devices[GGML_CUDA_MAX_DEVICES]; // CUDA device id of each rank
+    size_t   buf_bytes;    // bytes per device in host_buf[]
+    size_t   copy_bytes;   // bytes per device in host_large[] / dev_tmp[]
+    size_t   copy_threshold;   // from GGML_CUDA_AR_COPY_THRESHOLD, else the built-in default
+    size_t   copy_chunk_bytes; // fixed copy-engine chunk size; 0 selects the per-call heuristic
+    size_t   bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
+    uint64_t call_count;   // slots acquired so far; drives both the ring index and the token
+
+    // Per-device resources.
+    ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES];   // pinned staging (chunked kernel)
+    ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
+    char *                    dev_tmp[GGML_CUDA_MAX_DEVICES];    // device scratch for copy-engine path
+    cudaStream_t             streams[GGML_CUDA_MAX_DEVICES];   // non-blocking
+    ggml_cuda_ar_event_slot  ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
+
+    // Copy-engine: per-device "I finished reading my peer's host_large"
+    // event.  Indexed by RECORDER device.  Recorded same-device on streams[i]
+    // after stage 2's last H2D from host_large[peer].  Waited cross-device
+    // by peer's stage-1 stream before the next AR overwrites host_large[peer].
+    cudaEvent_t              host_large_read_done[GGML_CUDA_MAX_DEVICES];
+    bool                     host_large_read_done_valid; // false until the first copy-engine AR has recorded the events
+
+    // Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
+    // Recorded on the compute stream after each add_kernel; the AR stream
+    // waits on it before the next copy_impl's H2D overwrites dev_tmp.  Lets us
+    // single-buffer dev_tmp despite add_kernel running on a separate stream.
+    cudaEvent_t              dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
+    bool                     dev_tmp_kernel_done_valid;  // false until the first copy-engine AR has recorded the events
+
+    // Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints.  Mapped pinned
+    // memory; CPU never reads/writes -- only the kernel and cudaMemset.
+    // Use ggml_cuda_ar_arrival_ptr() to index.
+    ggml_cuda_ar_host_mapping arrival;
+};
+
+// Base pointer for the (slot, rank) per-block token block.  The kernel adds
+// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
+// Returns the device-side address of the first arrival int belonging to
+// (slot, rank).  Entries are laid out slot-major, then by rank, and each entry
+// spans one ARRIVAL_STRIDE per kernel block.
+static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
+    const size_t entry_bytes = GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
+    const size_t entry_index = (size_t)slot * p->n_devices + rank;
+    uint8_t * const entry = p->arrival.dev + entry_index * entry_bytes;
+    return reinterpret_cast<int *>(entry);
+}
+
+// Reads environment variable `name` as an unsigned decimal integer.
+//
+// Returns `default_value` when the variable is unset, empty, contains no
+// digits, carries a minus sign, or is followed by anything other than
+// whitespace (e.g. "4M").  Values too large for 64 bits saturate to the
+// maximum, which is what strtoull() reports on overflow.
+static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
+    const char * value = getenv(name);
+    if (value == nullptr || value[0] == '\0') {
+        return default_value;
+    }
+
+    char * end = nullptr;
+    const unsigned long long parsed = strtoull(value, &end, 10);
+    if (end == value) {
+        return default_value; // nothing parseable
+    }
+
+    // strtoull() accepts a leading '-' and returns the negated value as an
+    // unsigned number ("-1" -> 2^64-1).  The consumed prefix holds only
+    // whitespace, an optional sign and digits, so any '-' in it is that sign.
+    for (const char * c = value; c != end; ++c) {
+        if (*c == '-') {
+            return default_value;
+        }
+    }
+
+    // Tolerate trailing whitespace, reject any other trailing characters so a
+    // value such as "4M" is not silently read as 4.
+    while (*end == ' ' || *end == '\t' || *end == '\n' || *end == '\r') {
+        ++end;
+    }
+    if (*end != '\0') {
+        return default_value;
+    }
+
+    return static_cast<uint64_t>(parsed);
+}
+
+// Result of ggml_cuda_ar_acquire_slot().
+struct ggml_cuda_ar_slot_info {
+    int slot;   // ring index: call number modulo GGML_CUDA_AR_POOL_SIZE
+    int token;  // call count after this acquisition (first call yields 1), truncated to int
+};
+
+// Claims the next ring slot and bumps the call counter.  Once the ring has
+// wrapped, blocks the host until the `ker` event previously recorded in that
+// slot has completed on every device.  The returned token is the counter value
+// after the bump.
+static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
+    const uint64_t call_idx = p->call_count++;
+    const int      slot     = static_cast<int>(call_idx % GGML_CUDA_AR_POOL_SIZE);
+
+    if (call_idx >= GGML_CUDA_AR_POOL_SIZE) {
+        for (int rank = 0; rank < p->n_devices; ++rank) {
+            ggml_cuda_set_device(p->devices[rank]);
+            CUDA_CHECK(cudaEventSynchronize(p->ev_pool[rank][slot].ker));
+        }
+    }
+
+    ggml_cuda_ar_slot_info info;
+    info.slot  = slot;
+    info.token = (int) p->call_count;
+    return info;
+}
+
+// Per-AR copy-engine chunk size: env-var override if set, else heuristic
+// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
+// Chunk size for one copy-engine AR: the fixed override stored in the pipeline
+// when it is non-zero, otherwise a quarter of `nbytes` clamped to the
+// heuristic bounds.
+static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
+    const size_t forced = p->copy_chunk_bytes;
+    if (forced > 0) {
+        return forced;
+    }
+
+    size_t chunk = nbytes / 4;
+    if (chunk < GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN) {
+        chunk = GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN;
+    }
+    if (chunk > GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX) {
+        chunk = GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX;
+    }
+    return chunk;
+}
+
+// Orders the AR stream of `rank` after everything queued so far on that
+// rank's compute stream: records the slot's `app` event on the compute stream
+// and makes the AR stream wait for it.
+static void ggml_cuda_ar_wait_for_compute(
+        ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
+    const cudaEvent_t upstream_done = p->ev_pool[rank][slot].app;
+    const cudaStream_t ar_stream    = p->streams[rank];
+
+    CUDA_CHECK(cudaEventRecord(upstream_done, cuda_ctx->stream()));
+    CUDA_CHECK(cudaStreamWaitEvent(ar_stream, upstream_done));
+}
+
+// ---------------------------------------------------------------------------
+// Init / free
+// ---------------------------------------------------------------------------
+
+// Creates the AllReduce pipeline for the given devices.
+//
+//   devices    CUDA device ids, one per rank
+//   n_devices  number of entries in `devices`; only 2 is supported
+//
+// Returns the new pipeline, or nullptr when the configuration is unsupported
+// (n_devices != 2, or a device below GGML_CUDA_CC_VOLTA) or when any CUDA
+// resource cannot be created.  On a resource failure everything created so
+// far is released and the CUDA runtime's last-error state is cleared, so that
+// a later cudaGetLastError() check after an unrelated kernel launch does not
+// pick up the stale error from this abandoned initialization.
+ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
+
+    if (n_devices != 2) {
+        GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
+                       "falling back\n", __func__, n_devices);
+        return nullptr;
+    }
+
+    // The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
+    for (size_t i = 0; i < n_devices; ++i) {
+        const int cc = ggml_cuda_info().devices[devices[i]].cc;
+        if (cc < GGML_CUDA_CC_VOLTA) {
+            GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
+                           "(device %d has cc=%d); falling back\n",
+                           __func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
+            return nullptr;
+        }
+    }
+
+    auto * p = new ggml_cuda_ar_pipeline{};
+
+    // Shared failure exit: tear down whatever exists, then reset the runtime's
+    // last-error state left behind by the failed call.
+    const auto fail = [p]() -> ggml_cuda_ar_pipeline * {
+        ggml_cuda_ar_pipeline_free(p);
+        (void) cudaGetLastError();
+        return nullptr;
+    };
+
+    p->n_devices        = static_cast<int>(n_devices);
+    p->copy_bytes       = GGML_CUDA_AR_COPY_MAX_BYTES;
+    p->copy_threshold   = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
+    // 0 = use the per-call heuristic (default).  Non-zero env value forces a
+    // fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
+    p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
+    if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
+        GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
+                      __func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
+        p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
+    }
+    // Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
+    // ne).  Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
+    // byte threshold to opt out for small tensors.
+    p->bf16_threshold   = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
+    for (size_t i = 0; i < n_devices; ++i) {
+        p->devices[i] = devices[i];
+    }
+
+    // Per-device streams and event pools.
+    for (size_t i = 0; i < n_devices; ++i) {
+        ggml_cuda_set_device(p->devices[i]);
+
+        cudaStream_t stream = nullptr;
+        if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
+                           __func__, p->devices[i]);
+            return fail();
+        }
+        p->streams[i] = stream;
+
+        for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
+            bool ok =
+                cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
+                cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
+                cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
+            for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
+                ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
+            }
+            if (!ok) {
+                GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
+                               __func__, p->devices[i], s);
+                return fail();
+            }
+        }
+
+        if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
+                           __func__, p->devices[i]);
+            return fail();
+        }
+        if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
+                           __func__, p->devices[i]);
+            return fail();
+        }
+    }
+
+    // Arrival ring: one ARRIVAL_STRIDE-sized (cache-line) cell per
+    // (slot, rank, block), so every arrival int sits on its own line.
+    const size_t arrival_bytes =
+        (size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
+        GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
+    if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
+        GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
+                       __func__, arrival_bytes);
+        return fail();
+    }
+    // Tokens start at 1 (see acquire_slot), so an all-zero ring means
+    // "nobody has arrived yet".
+    ggml_cuda_set_device(p->devices[0]);
+    if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
+        GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
+                       __func__, arrival_bytes);
+        return fail();
+    }
+
+    // Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
+    // kernel can write the next slot's data while the peer is still reading
+    // the previous slot's. Indexed by (slot * buf_bytes) at the call site.
+    p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
+    const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
+    for (size_t i = 0; i < n_devices; ++i) {
+        if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
+                           __func__, host_buf_total);
+            return fail();
+        }
+    }
+
+    // Copy-engine path: pinned host staging + device scratch, sized for the
+    // largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
+    // dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
+    // cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
+    for (size_t i = 0; i < n_devices; ++i) {
+        ggml_cuda_set_device(p->devices[i]);
+        if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
+                           __func__, p->copy_bytes);
+            return fail();
+        }
+        if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
+                           __func__, p->copy_bytes, p->devices[i]);
+            return fail();
+        }
+    }
+
+    GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
+                  "%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
+                  __func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
+
+    return p;
+}
+
+// Releases every resource held by `p` and deletes it.  Accepts nullptr and
+// partially initialized pipelines: each handle is only released when non-null.
+void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
+    if (p == nullptr) {
+        return;
+    }
+
+    // Destroys an event handle that was actually created.
+    const auto destroy_event = [](cudaEvent_t ev) {
+        if (ev) {
+            cudaEventDestroy(ev);
+        }
+    };
+
+    // Let the work queued on the AR streams finish before anything is freed.
+    for (int rank = 0; rank < p->n_devices; ++rank) {
+        if (!p->streams[rank]) {
+            continue;
+        }
+        ggml_cuda_set_device(p->devices[rank]);
+        cudaStreamSynchronize(p->streams[rank]);
+    }
+
+    for (int rank = 0; rank < p->n_devices; ++rank) {
+        p->host_buf[rank].free();
+        p->host_large[rank].free();
+
+        // Everything below belongs to this rank's device.
+        ggml_cuda_set_device(p->devices[rank]);
+
+        if (p->dev_tmp[rank]) {
+            cudaFree(p->dev_tmp[rank]);
+        }
+
+        for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
+            ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][s];
+            destroy_event(ev.app);
+            for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
+                destroy_event(ev.cpy[c]);
+            }
+            destroy_event(ev.h2d);
+            destroy_event(ev.ker);
+        }
+
+        destroy_event(p->host_large_read_done[rank]);
+        destroy_event(p->dev_tmp_kernel_done[rank]);
+
+        if (p->streams[rank]) {
+            cudaStreamDestroy(p->streams[rank]);
+        }
+    }
+
+    p->arrival.free();
+    delete p;
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch
+// ---------------------------------------------------------------------------
+
+// Asymmetric copy_impl: data is sent over PCIe in T_src precision (nbytes is
+// the wire size of the ne elements) and accumulated locally into a T_dst buffer.  When
+// T_src == T_dst this is the original homogeneous reduction.  When they differ
+// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
+// for bit-equivalence between GPUs and we skip the otherwise-needed
+// post-conversion entirely.
+// Parameters:
+//   p         pipeline state; asserted to hold exactly two devices
+//   backends  one backend per rank; backends[i]->context is used as the CUDA
+//             context and must belong to p->devices[i]
+//   src_buf   per-rank device buffers holding the data to send (T_src)
+//   dst_buf   per-rank device buffers that are accumulated into (T_dst)
+//   compute   per-rank flag; when false, src_buf[i] is zero-filled before it
+//             is copied out
+//   ne        element count handed to the add kernel (must fit in int)
+//   nbytes    bytes staged per rank (must not exceed p->copy_bytes)
+// Always returns true; CUDA errors are reported through CUDA_CHECK.
+template <typename T_src, typename T_dst>
+static bool ggml_cuda_ar_allreduce_copy_impl(
+        ggml_cuda_ar_pipeline * p,
+        ggml_backend_t        * backends,
+        T_src * const           src_buf[GGML_CUDA_MAX_DEVICES],
+        T_dst * const           dst_buf[GGML_CUDA_MAX_DEVICES],
+        const bool              compute[GGML_CUDA_MAX_DEVICES],
+        int64_t                 ne,
+        size_t                  nbytes) {
+    GGML_ASSERT(p->n_devices == 2);
+    GGML_ASSERT(nbytes <= p->copy_bytes);
+    GGML_ASSERT(ne <= std::numeric_limits<int>::max());
+
+    const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
+    GGML_ASSERT(chunk_bytes > 0);
+
+    // Only the ring slot is needed here; the token is not used on this path.
+    const int slot = ggml_cuda_ar_acquire_slot(p).slot;
+    const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
+    // Each chunk records its own cpy[] event, so the count is bounded by the array size.
+    GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
+
+    ggml_backend_cuda_context * cuda_ctx[2] = {};
+
+    // Stage 1: both GPUs copy their local contribution to pinned host memory.
+    for (int i = 0; i < 2; ++i) {
+        ggml_cuda_set_device(p->devices[i]);
+        cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+        GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
+
+        ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
+
+        // Wait for peer's H2D from our host_large[i] (recorded in the
+        // previous AR's stage 2) to complete before we overwrite host_large[i].
+        // host_large_read_done[peer] = peer finished reading host_large[i].
+        // No-op on the first AR -- no prior record exists.
+        if (p->host_large_read_done_valid) {
+            const int peer = 1 - i;
+            CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
+        }
+
+        // A rank that did not compute sends zeros.
+        if (!compute[i]) {
+            CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
+        }
+
+        for (size_t c = 0; c < copy_chunks; ++c) {
+            const size_t offset = c * chunk_bytes;
+            // The last chunk may be shorter than chunk_bytes.
+            const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
+                (nbytes - offset) : chunk_bytes;
+
+            CUDA_CHECK(cudaMemcpyAsync(
+                p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
+                cudaMemcpyDeviceToHost, p->streams[i]));
+            CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
+        }
+    }
+
+    // Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
+    // local device scratch (dev_tmp), then performs one device-local add over
+    // the assembled peer tensor.  The H2Ds run on the AR stream (copy engine)
+    // and the add_kernel runs on the caller's compute stream, so the AR stream
+    // stays pure-copy and avoids an in-stream copy->compute engine switch every
+    // AR.  dev_tmp is single-buffered: the AR stream waits cross-stream on the
+    // prior AR's add_kernel-done event before overwriting it.
+    for (int i = 0; i < 2; ++i) {
+        const int peer = 1 - i;
+        ggml_cuda_set_device(p->devices[i]);
+
+        // Wait for the previous AR's add_kernel (on the compute stream) to
+        // finish reading dev_tmp before our H2D overwrites it.  No-op on the
+        // first copy_impl call.
+        if (p->dev_tmp_kernel_done_valid) {
+            CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
+        }
+
+        for (size_t c = 0; c < copy_chunks; ++c) {
+            const size_t offset = c * chunk_bytes;
+            const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
+                (nbytes - offset) : chunk_bytes;
+
+            // Chunk c may be pulled as soon as the peer's D2H of that chunk is done.
+            CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
+            CUDA_CHECK(cudaMemcpyAsync(
+                p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
+                cudaMemcpyHostToDevice, p->streams[i]));
+        }
+
+        // Mark our reads of host_large[peer] complete so peer's next AR can
+        // safely overwrite it.
+        CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
+
+        // Hand off from AR stream (copy engine) to compute stream: compute
+        // stream waits for all H2Ds to finish, then runs the add_kernel.
+        CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
+        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
+
+        // One thread per element, capped at 1024 blocks; the kernel's strided
+        // loop covers the remainder.
+        const int block_size = 256;
+        int n_blocks = (int) ((ne + block_size - 1) / block_size);
+        if (n_blocks > 1024) {
+            n_blocks = 1024;
+        }
+        ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
+            dst_buf[i],
+            reinterpret_cast<const T_src *>(p->dev_tmp[i]),
+            (int) ne);
+        CUDA_CHECK(cudaGetLastError());
+
+        // Record dev_tmp-released on the compute stream so the next copy_impl
+        // can wait for the kernel to finish before overwriting dev_tmp.  Also
+        // record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
+        CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
+        CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
+    }
+    // From now on both per-device events have been recorded at least once.
+    p->host_large_read_done_valid = true;
+    p->dev_tmp_kernel_done_valid = true;
+
+    return true;
+}
+
+// Slicing wrapper around copy_impl.  One copy_impl call can move at most
+// copy_bytes (the size of the host_large / dev_tmp allocations), so a larger
+// reduction is cut into consecutive pieces of at most that many bytes and
+// copy_impl is invoked once per piece.  Every piece runs its own
+// stage 1 -> stage 2 cycle on its own slot, so cross-AR fences and pool
+// wraparound treat the pieces like any other sequence of small ARs.
+template <typename T_src, typename T_dst>
+static bool ggml_cuda_ar_allreduce_copy_outer(
+        ggml_cuda_ar_pipeline * p,
+        ggml_backend_t        * backends,
+        T_src * const           src_buf[GGML_CUDA_MAX_DEVICES],
+        T_dst * const           dst_buf[GGML_CUDA_MAX_DEVICES],
+        const bool              compute[GGML_CUDA_MAX_DEVICES],
+        int64_t                 ne) {
+    const int64_t slice_cap = (int64_t) (p->copy_bytes / sizeof(T_src));
+    GGML_ASSERT(slice_cap > 0);
+
+    for (int64_t first = 0; first < ne; first += slice_cap) {
+        const int64_t slice_ne    = (ne - first) < slice_cap ? (ne - first) : slice_cap;
+        const size_t  slice_bytes = (size_t) slice_ne * sizeof(T_src);
+        T_src * slice_src[GGML_CUDA_MAX_DEVICES] = {};
+        T_dst * slice_dst[GGML_CUDA_MAX_DEVICES] = {};
+        for (int dev = 0; dev < p->n_devices; ++dev) {
+            slice_src[dev] = src_buf[dev] + first;
+            slice_dst[dev] = dst_buf[dev] + first;
+        }
+        if (!ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
+                p, backends, slice_src, slice_dst, compute, slice_ne, slice_bytes)) {
+            return false;  // stop at the first slice that fails
+        }
+    }
+    return true;
+}
+
+bool ggml_cuda_ar_allreduce(  // in-place sum AllReduce over the pipeline's two devices
+        ggml_cuda_ar_pipeline * p,
+        ggml_backend_t        * backends,
+        ggml_tensor           ** tensors) {
+    GGML_ASSERT(p != nullptr);
+
+    const int n = p->n_devices;
+    GGML_ASSERT(n == 2);  // the chunked path derives the peer rank as 1 - i
+
+    const ggml_type input_type = tensors[0]->type;
+    GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
+
+    const int64_t ne = ggml_nelements(tensors[0]);
+    GGML_ASSERT(ne > 0);  // empty tensors have to be filtered out by the caller
+
+    const size_t   input_nbytes = ggml_nbytes(tensors[0]);  // byte size in the tensor's own type
+
+    // BF16 wire format: an F32 input of at least bf16_threshold bytes is
+    // reduced as BF16 on either path (chunked or copy-engine), halving the
+    // bytes on the wire; this matches NCCL's behaviour.  A threshold of 0
+    // disables the conversion.  F16/BF16 inputs always keep their own type.
+    const bool use_bf16 =
+        input_type == GGML_TYPE_F32 &&
+        p->bf16_threshold > 0 &&
+        input_nbytes >= p->bf16_threshold;
+
+    const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
+    const size_t    type_size   = ggml_type_size(kernel_type);
+    GGML_ASSERT(p->buf_bytes >= type_size);  // keeps max_chunk_elems >= 1 so the chunk loop advances
+    const size_t    nbytes      = (size_t) ne * type_size;  // payload size in the working (wire) type
+
+    bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
+    for (int i = 0; i < n; ++i) {
+        compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
+    }
+
+    // Path selection uses the byte count in the working type: at or above
+    // copy_threshold the copy-engine path runs, below it (or when the threshold
+    // is 0) the chunked kernel path.  No upper bound: copy_outer slices large ARs.
+    const bool use_copy_engine =
+        p->copy_threshold > 0 &&
+        nbytes >= p->copy_threshold;
+
+    // BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
+    // (chunked kernel path) and the combined add kernel (copy_engine path)
+    // both accumulate into the F32 tensor data directly, so an inactive
+    // shard's accumulator must start at zero.
+    if (use_bf16) {
+        for (int i = 0; i < n; ++i) {
+            if (!compute_flag[i]) {
+                auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+                GGML_ASSERT(cuda_ctx->device == p->devices[i]);
+                ggml_cuda_set_device(p->devices[i]);
+                CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
+            }
+        }
+    }
+
+    // Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
+    // path; the chunked kernel path's combined kernel does the conversion
+    // inline as it writes to host_buf.
+    ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
+    void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
+
+    if (use_copy_engine && use_bf16) {
+        to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
+        for (int i = 0; i < n; ++i) {
+            auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+            GGML_ASSERT(cuda_ctx->device == p->devices[i]);
+            bf16_tmp[i].pool = &cuda_ctx->pool();
+            bf16_tmp[i].alloc(ne);
+            ggml_cuda_set_device(p->devices[i]);
+            if (compute_flag[i]) {
+                to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
+                CUDA_CHECK(cudaGetLastError());
+            } else {
+                CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));  // inactive shard contributes zeros
+            }
+            copy_src_ptr[i] = bf16_tmp[i].get();
+        }
+    }
+
+    bool ok = true;  // only the copy-engine path can set this to false
+    if (use_copy_engine) {
+        // After up-front BF16 conversion, the tmp buffers already hold the
+        // (possibly zeroed-for-inactive) data, so the inner path can treat
+        // every shard as compute.
+        bool inner_compute[GGML_CUDA_MAX_DEVICES];
+        for (int i = 0; i < n; ++i) {
+            inner_compute[i] = use_bf16 ? true : compute_flag[i];
+        }
+
+        // Dispatch into copy_impl with explicit src/dst types.  When use_bf16
+        // is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
+        // is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
+        // through BF16 for bit-equivalence and writes F32 directly, so no
+        // post-conversion is needed.  Otherwise src == dst (same native type).
+        if (use_bf16) {
+            GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
+            nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
+            float       * dst[GGML_CUDA_MAX_DEVICES] = {};
+            for (int i = 0; i < n; ++i) {
+                src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
+                dst[i] = static_cast<float *>(tensors[i]->data);
+            }
+            ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
+                p, backends, src, dst, inner_compute, ne);
+        } else {
+            switch (kernel_type) {
+                case GGML_TYPE_F32: {
+                    float * buf[GGML_CUDA_MAX_DEVICES] = {};
+                    for (int i = 0; i < n; ++i) {
+                        buf[i] = static_cast<float *>(tensors[i]->data);
+                    }
+                    ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
+                        p, backends, buf, buf, inner_compute, ne);
+                    break;
+                }
+                case GGML_TYPE_BF16: {
+                    nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
+                    for (int i = 0; i < n; ++i) {
+                        buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
+                    }
+                    ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
+                        p, backends, buf, buf, inner_compute, ne);
+                    break;
+                }
+                case GGML_TYPE_F16: {
+                    half * buf[GGML_CUDA_MAX_DEVICES] = {};
+                    for (int i = 0; i < n; ++i) {
+                        buf[i] = static_cast<half *>(tensors[i]->data);
+                    }
+                    ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
+                        p, backends, buf, buf, inner_compute, ne);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false);
+            }
+        }
+    } else {
+        // host_buf carries T_wire-typed data; max_chunk_elems is the count that
+        // fits in one host_buf at the wire size.
+        const size_t max_chunk_elems = p->buf_bytes / type_size;
+        const size_t input_type_size = ggml_type_size(input_type);
+
+        // Chunked kernel path runs entirely on the caller's compute stream:
+        // since AR is a barrier here, same-stream ordering subsumes any
+        // cross-stream event handshake that the copy-engine path needs, and
+        // skips the cross-stream scheduling overhead that was hurting the
+        // small-tensor (tg) latency on the AR-stream variant.  Only ev.ker is
+        // still recorded at end-of-AR for acquire_slot's pool-wraparound check.
+        for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
+            const size_t remaining_elems = (size_t) (ne - chunk_start);
+            const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
+            const size_t chunk_dst_bytes  = chunk_elems * input_type_size;
+
+            const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
+            const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;  // gates the ev.ker record below
+
+            for (int i = 0; i < n; ++i) {
+                const int peer = 1 - i;  // valid for n == 2 only
+                ggml_cuda_set_device(p->devices[i]);
+                auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+                GGML_ASSERT(cuda_ctx->device == p->devices[i]);
+                cudaStream_t stream = cuda_ctx->stream();
+
+                char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;  // offset in input-type bytes
+
+                // Match NCCL/meta-backend semantics: inactive shards contribute
+                // zeros.  On the BF16 path the F32 tensor data was already
+                // zeroed up-front (above), so per-chunk zeroing isn't needed.
+                if (!compute_flag[i] && !use_bf16) {
+                    CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
+                }
+
+#define LAUNCH_AR_KERNEL(T_dst, T_wire) \
+                ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
+                    reinterpret_cast<const T_dst *>(data), \
+                    reinterpret_cast<T_dst *>(data), \
+                    reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
+                    reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
+                    static_cast<int>(chunk_elems), \
+                    ggml_cuda_ar_arrival_ptr(p, slot, i), \
+                    ggml_cuda_ar_arrival_ptr(p, slot, peer), \
+                    token)
+
+                if (use_bf16) {
+                    GGML_ASSERT(input_type == GGML_TYPE_F32);
+                    LAUNCH_AR_KERNEL(float, nv_bfloat16);
+                } else {
+                    switch (input_type) {
+                        case GGML_TYPE_F32:  LAUNCH_AR_KERNEL(float,       float);       break;
+                        case GGML_TYPE_F16:  LAUNCH_AR_KERNEL(half,        half);        break;
+                        case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
+                        default: GGML_ASSERT(false);
+                    }
+                }
+
+#undef LAUNCH_AR_KERNEL
+                CUDA_CHECK(cudaGetLastError());
+
+                if (last_chunk) {
+                    CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
+                }
+            }
+        }
+    }
+
+    return ok;
+}
+
+#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
+
+// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
+// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
+// implementation relies on, so the internal AllReduce is CUDA-only and these
+// stubs stand in for it.  The dispatcher in ggml-cuda.cu treats a nullptr pipeline
+// as "init failed" and silently falls back to the meta backend's generic AllReduce.
+ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
+    return nullptr;  // always reported as "init failed" on these builds
+}
+void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {  // nothing was allocated, nothing to release
+}
+bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
+    return false;  // no reduction is performed
+}
+
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
--- a/ggml/src/ggml-cuda/allreduce.cuh
+++ b/ggml/src/ggml-cuda/allreduce.cuh
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "common.cuh"
+#include "ggml-backend-impl.h"
+
+#include <cstddef>
+
+// Opaque pipeline context -- owns all pinned buffers, streams, and events.
+struct ggml_cuda_ar_pipeline;
+
+// Allocate a pipeline for n_devices GPUs (ggml_cuda_ar_allreduce asserts exactly 2).
+// devices[] holds the CUDA device IDs in rank order.
+// Returns nullptr on allocation failure; HIP/MUSA builds always return nullptr.
+ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
+    const int * devices, size_t n_devices);
+
+// Release all resources owned by the pipeline (the HIP/MUSA stub is a no-op).
+void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
+
+// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
+// tensors[i] must live on the device managed by backends[i] and be
+// contiguous, non-empty F32, F16, or BF16; the pipeline must hold 2 devices.
+// Type, element count and device count are asserted, so the CUDA comm dispatcher checks them first.
+// Returns true once the reduction work has been enqueued; the HIP/MUSA stub returns false.
+bool ggml_cuda_ar_allreduce(
+    ggml_cuda_ar_pipeline * pipeline,
+    ggml_backend_t        * backends,
+    ggml_tensor           ** tensors);
+
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -61,6 +61,11 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2,  64,  64,  64,  64, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2,  64,  64,  64,  64, 2, true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128,  8,  64, 4,  64,  96,  64,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16,  64, 4,  32,  96,  64,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2,  32,  96,  64,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2,  32,  96,  64,  64, 2, true);
+
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256,  8,  64, 4,  64, 128, 128, 128, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16,  64, 4,  32, 128, 128, 128, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  32, 128, 128, 128, 2, true);
@@ -1561,6 +1566,10 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
+    if (DKQ == 192 && ncols2 != 8 && ncols2 != 16) {
+        NO_DEVICE_CODE;
+        return;
+    }
 #ifdef VOLTA_MMA_AVAILABLE
    if (ncols1*ncols2 < 32) {
        NO_DEVICE_CODE;
--- a/ggml/src/ggml-cuda/fattn-tile.cu
+++ b/ggml/src/ggml-cuda/fattn-tile.cu
@@ -34,6 +34,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case<128, 128>(ctx, dst);
        } break;
+        case 192: {
+            GGML_ASSERT(V->ne[0] == 128);
+            ggml_cuda_flash_attn_ext_tile_case<192, 128>(ctx, dst);
+        } break;
        case 256: {
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -62,6 +62,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2,  64, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2,  64,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 256, 2,  64,  64)
@@ -124,6 +130,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 128, 3,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2, 128, 3,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 128, 3,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2,  32,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2, 128, 3,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 128, 3,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 256, 2,  32, 256)
@@ -193,6 +205,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2,  64,  32)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2, 256, 2, 128,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2,  32,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2, 256, 2, 128,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 256, 2,  64, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 256, 2,  64, 128)
@@ -264,6 +282,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2,  64, 8,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 128, 6,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 128, 6,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 5,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 3,  64,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2,  64, 8,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 128, 6,  32, 256)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 128, 6,  32, 256)
@@ -1250,7 +1274,20 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
        }
    }

-    if constexpr (DKQ <= 512 && DKQ != 320) {
+    if constexpr (DKQ == 192) {
+        // MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
+        if (use_gqa_opt && gqa_ratio % 16 == 0) {
+            launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
+            return;
+        }
+        if (use_gqa_opt && gqa_ratio % 8 == 0) {
+            launch_fattn_tile_switch_ncols1<DKQ, DV,  8, use_logit_softcap>(ctx, dst);
+            return;
+        }
+        GGML_ABORT("flash-attn tile (192/128): expected GQA ratio multiple of 8");
+    }
+
+    if constexpr (DKQ <= 512 && DKQ != 320 && DKQ != 192) {
        if (use_gqa_opt && gqa_ratio % 8 == 0) {
            launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
            return;
@@ -1303,6 +1340,7 @@ extern DECL_FATTN_TILE_CASE( 80,  80);
 extern DECL_FATTN_TILE_CASE( 96,  96);
 extern DECL_FATTN_TILE_CASE(112, 112);
 extern DECL_FATTN_TILE_CASE(128, 128);
+extern DECL_FATTN_TILE_CASE(192, 128);
 extern DECL_FATTN_TILE_CASE(256, 256);
 extern DECL_FATTN_TILE_CASE(320, 256);
 extern DECL_FATTN_TILE_CASE(512, 512);
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -139,6 +139,22 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
            GGML_ASSERT(V->ne[0] == 128);
            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
            break;
+        case 192: {
+            // MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
+            GGML_ASSERT(V->ne[0] == 128);
+            float max_bias = 0.0f;
+            memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
+            const bool use_gqa_opt = mask && max_bias == 0.0f;
+            GGML_ASSERT(use_gqa_opt);
+            GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
+            const int gqa_ratio = Q->ne[2] / K->ne[2];
+            if (gqa_ratio % 16 == 0) {
+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 16>(ctx, dst);
+            } else {
+                GGML_ASSERT(gqa_ratio % 8 == 0);
+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128,  8>(ctx, dst);
+            }
+        } break;
        case 256:
            GGML_ASSERT(V->ne[0] == 256);
            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
@@ -368,6 +384,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                return BEST_FATTN_KERNEL_NONE;
            }
            break;
+        case 192:
+            if (V->ne[0] != 128 || !gqa_opt_applies) {
+                return BEST_FATTN_KERNEL_NONE;
+            }
+            if (gqa_ratio % 8 != 0) {
+                return BEST_FATTN_KERNEL_NONE;
+            }
+            break;
        case 320:
            if (V->ne[0] != 256 || !gqa_opt_applies) {
                return BEST_FATTN_KERNEL_NONE;
@@ -425,7 +449,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    }

    // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
-    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
+    // 192 satisfies % 64 == 0 but has no vec instance (DKQ != DV); force it onto the MMA path.
+    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && Q->ne[0] != 192 && K->ne[1] % FATTN_KQ_STRIDE == 0;

    // If Turing tensor cores are available, use them:
    if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
@@ -454,7 +479,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const

    if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
        int gqa_ratio_eff = 1;
-        const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
+        const int ncols2_max = (Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8;
        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
            gqa_ratio_eff *= 2;
        }
@@ -468,7 +493,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    }

    // Use the WMMA kernel if possible:
-    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 512 && Q->ne[0] != 576) {
+    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 512 && Q->ne[0] != 576) {
        if (can_use_vector_kernel && Q->ne[1] <= 2) {
            return BEST_FATTN_KERNEL_VEC;
        }
@@ -501,7 +526,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    }

    // Use MFMA flash attention for CDNA (MI100+):
-    if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
+    if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
        const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1);
        // MMA vs tile crossover benchmarked on MI300X @ d32768:
        //   hsk=64  (gqa=4): MMA wins at eff >= 128 (+11%)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2,6 +2,7 @@
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"

+#include "ggml-cuda/allreduce.cuh"
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
 #include "ggml-cuda/add-id.cuh"
@@ -39,6 +40,7 @@
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/roll.cuh"
 #include "ggml-cuda/scale.cuh"
+#include "ggml-cuda/snake.cuh"
 #include "ggml-cuda/softcap.cuh"
 #include "ggml-cuda/softmax.cuh"
 #include "ggml-cuda/ssm-conv.cuh"
@@ -85,6 +87,9 @@

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+#define GGML_LOG_WARN_ONCE(str) /* logs str once per call site; do/while(0) keeps the expansion a single statement */ \
+    do { static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); } while (0)
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
    int id = -1; // in case cudaGetDevice fails
@@ -1138,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };

-#ifdef GGML_USE_NCCL
+// Communication context for multi-GPU AllReduce during tensor parallelism.
+//
+// Created once per meta backend instance.  Resources for the selected mode
+// (NCCL communicators or the internal AllReduce pipeline) are initialised
+// eagerly during comm_init so any init failure surfaces at startup rather
+// than mid-run.
 struct ggml_backend_cuda_comm_context {
+    using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);  // false = call not handled
+
     std::vector<ggml_backend_t> backends;
-    std::vector<ncclComm_t> comms;
+    std::vector<int>            dev_ids;  // NOTE(review): presumably the CUDA device id of each backends[i] - confirm
+
+    // Set by the init chain (comm_init_{nccl, internal, none}) to one of
+    // try_allreduce_{nccl, internal, butterfly}.  nccl needs `comms`,
+    // internal needs `ar_pipeline`, butterfly needs nothing.  Per-call
+    // failures return false; the meta backend's generic implementation then
+    // handles that call.
+    try_allreduce_fn            try_allreduce = nullptr;
+
+    ggml_cuda_ar_pipeline *     ar_pipeline = nullptr;  // owned: released in the destructor
+
+#ifdef GGML_USE_NCCL
+    std::vector<ncclComm_t>     comms;
+#endif // GGML_USE_NCCL
 
     ~ggml_backend_cuda_comm_context() {
+#ifdef GGML_USE_NCCL
         for (ncclComm_t comm : comms) {
             NCCL_CHECK(ncclCommDestroy(comm));
         }
+#endif // GGML_USE_NCCL
+        ggml_cuda_ar_pipeline_free(ar_pipeline);  // NOTE(review): may still be nullptr (its default); confirm free() accepts that
     }
 };
-#endif // GGML_USE_NCCL

-static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
-#ifdef GGML_USE_NCCL
-    if (comm_ctx_v == nullptr) {
-        return;
-    }
-    ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
-    delete comm_ctx;
-#else
-    GGML_UNUSED(comm_ctx_v);
-#endif // GGML_USE_NCCL
-}
-
-static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
-#ifdef GGML_USE_NCCL
-    for (size_t i = 0; i < n_backends; i++) {
-        if (!ggml_backend_is_cuda(backends[i])) {
-            return nullptr;
-        }
-    }
-    ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
-    std::vector<int> dev_ids;
-    ret->backends.reserve(n_backends);
-    dev_ids.reserve(n_backends);
-    for (size_t i = 0; i < n_backends; i++) {
-        ret->backends.push_back(backends[i]);
-        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
-        dev_ids.push_back(cuda_ctx->device);
-    }
-
-    ret->comms.resize(n_backends);
-    NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
-    return ret;
-#else
-    // If NCCL is installed it is used by default for optimal performance.
-    // However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
-    // RCCL is disabled by default, users are explicitly opting in.
-    // Therefore print no warning for RCCL.
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    static bool warning_printed = false;
-    if (!warning_printed) {
-        GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
-        warning_printed = true;
-    }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    GGML_UNUSED_VARS(backends, n_backends);
-    return nullptr;
-#endif // GGML_USE_NCCL
-}
-
-static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
 #ifdef GGML_USE_NCCL
+// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
+// tensors (bandwidth-bound), then converts back to FP32.
+static bool ggml_backend_cuda_comm_allreduce_nccl(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
    const int64_t ne = ggml_nelements(tensors[0]);
    // FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
    // This then causes a crash in this function
@@ -1209,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
        return true;
    }

-    GGML_ASSERT(comm_ctx_v != nullptr);
-    ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
    const size_t n_backends = comm_ctx->backends.size();

    for (size_t i = 0; i < n_backends; ++i) {
@@ -1235,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
            NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
        }
        NCCL_CHECK(ncclGroupEnd());
-
        return true;
    }

@@ -1274,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
    }

    return true;
-#else
-    GGML_UNUSED_VARS(comm_ctx_v, tensors);
-    return false;
+}
 #endif // GGML_USE_NCCL
+
+// Run the internal AR pipeline on one tensor per backend.
+//
+// Asserted preconditions: the AR pipeline exists, there are exactly two
+// backends, and tensors[0] is non-null.
+//
+// Returns true immediately when tensors[0] has 0 elements; otherwise returns
+// the result of ggml_cuda_ar_allreduce.  Returns false on unsupported / failed
+// input -- the caller decides whether to abort (env-forced) or fall back
+// silently.
+static bool ggml_backend_cuda_comm_allreduce_internal(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
+    GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
+
+    const size_t n_backends = comm_ctx->backends.size();
+    GGML_ASSERT(n_backends == 2);
+    GGML_ASSERT(tensors[0] != nullptr);
+
+    const int64_t   ne   = ggml_nelements(tensors[0]);
+    const ggml_type type = tensors[0]->type;
+
+    if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
+        GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
+        return false;
+    }
+
+    // Nothing to reduce.
+    if (ne == 0) {
+        return true;
+    }
+
+    // Every tensor must match tensors[0] in element count and type, and must
+    // satisfy the layout checks below.
+    for (size_t i = 0; i < n_backends; ++i) {
+        if (tensors[i] == nullptr) {
+            GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
+            return false;
+        }
+        if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
+            GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
+                           __func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
+            return false;
+        }
+        if (!ggml_is_contiguously_allocated(tensors[i])) {
+            GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
+                           __func__, i, ne, ggml_nbytes(tensors[i]),
+                           (size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
+            return false;
+        }
+        if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
+            GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
+                           __func__, i, tensors[i]->data, (int) type, ne);
+            return false;
+        }
+        // A size that is not a multiple of 16 bytes is reported as unsupported
+        // (return false) instead of aborting, so the caller keeps the choice
+        // between aborting and falling back, as for the other layout checks.
+        if ((ggml_nbytes(tensors[i]) & 0xF) != 0) {
+            GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] size is not a multiple of 16 bytes: nbytes=%zu type=%d ne=%" PRId64 "\n",
+                           __func__, i, ggml_nbytes(tensors[i]), (int) type, ne);
+            return false;
+        }
+    }
+
+    return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
+}
+
+// ---------------------------------------------------------------------------
+// Per-call dispatch -- three variants, one per backend.  Each is set as
+// comm_ctx->try_allreduce by the matching init step.  Per-call failure
+// returns false; the meta backend's generic implementation handles that call.
+// ---------------------------------------------------------------------------
+
+#ifdef GGML_USE_NCCL
+// NCCL variant of the per-call dispatch (only compiled with GGML_USE_NCCL).
+// Forwards to ggml_backend_cuda_comm_allreduce_nccl and returns its result.
+static bool ggml_backend_cuda_comm_try_allreduce_nccl(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
+    return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
+}
+#endif // GGML_USE_NCCL
+
+// Internal-pipeline variant of the per-call dispatch.
+// Forwards to ggml_backend_cuda_comm_allreduce_internal and returns its result.
+static bool ggml_backend_cuda_comm_try_allreduce_internal(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
+    return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
+}
+
+// "None" variant of the per-call dispatch: ignores both arguments and always
+// returns false, i.e. this backend never handles the call itself.
+static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
+        ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
+    return false;
+}
+
+// Destroy a communication context previously handed out as an opaque pointer.
+// A null pointer is accepted and ignored.
+static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
+    if (comm_ctx_v != nullptr) {
+        auto * ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
+        delete ctx;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Init -- chained nccl -> internal -> none.  Each step tries to bring up its
+// resource; on failure it warns and recurses into the next step.
+// ---------------------------------------------------------------------------
+
+// Last step of the chain: acquires no resource and installs the dispatch
+// function that always returns false.
+static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
+    ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
+}
+
+// Middle step of the init chain: try to create the internal AllReduce
+// pipeline for the devices in ret->dev_ids.  On success the internal dispatch
+// function is installed; otherwise a warning is logged and the "none" step
+// takes over.
+static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
+    ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());
+
+    if (!ret->ar_pipeline) {
+        // Clear sticky CUDA error from the failed init.
+        (void) cudaGetLastError();
+        GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
+                      "falling back to meta-backend butterfly\n");
+        ggml_backend_cuda_comm_init_none(ret);
+        return;
+    }
+
+    ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
+}
+
+// First step of the init chain.
+//
+// With GGML_USE_NCCL: creates one NCCL communicator per device in
+// ret->dev_ids via ncclCommInitAll.  On success the NCCL dispatch function is
+// installed and the chain stops; on failure the communicator list is cleared
+// and a warning with the NCCL error string is logged.
+//
+// Without GGML_USE_NCCL: logs a recompile hint, but only on builds that are
+// neither HIP nor MUSA, since the hint does not apply there.
+//
+// In every non-success case the chain continues with the internal step.
+static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
+#ifdef GGML_USE_NCCL
+    const size_t n = ret->dev_ids.size();
+    ret->comms.resize(n);
+    ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data());
+    if (rc == ncclSuccess) {
+        ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
+        return;
+    }
+
+    ret->comms.clear();
+    GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
+                  ncclGetErrorString(rc));
+#else // GGML_USE_NCCL
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce.  "
+                  "Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#endif // GGML_USE_NCCL
+
+    ggml_backend_cuda_comm_init_internal(ret);
+}
+
+// Top-level init.  Picks one of the three init paths based on
+// GGML_CUDA_ALLREDUCE ("nccl", "internal" or "none") or, when the variable is
+// unset, the platform default, and lets the chain handle any fallback.
+// Unrecognised env values log a warning and select the "none" path.
+// NOTE(review): an earlier version of this comment said unknown values fall
+// through to the platform default; the code below uses "none" -- confirm
+// which behaviour is intended.
+//
+// Returns nullptr if any backend is not a CUDA backend; otherwise returns a
+// newly allocated ggml_backend_cuda_comm_context as an opaque pointer.
+static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
+    for (size_t i = 0; i < n_backends; i++) {
+        if (!ggml_backend_is_cuda(backends[i])) {
+            return nullptr;
+        }
+    }
+
+    // Record the backends and the CUDA device id of each one.
+    auto * ret = new ggml_backend_cuda_comm_context;
+    ret->backends.assign(backends, backends + n_backends);
+    ret->dev_ids.reserve(n_backends);
+    for (size_t i = 0; i < n_backends; i++) {
+        ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
+    }
+
+    const char * env = getenv("GGML_CUDA_ALLREDUCE");
+    if (!env) {
+        // Platform default: Linux uses NCCL, otherwise (generally Windows) internal
+#if defined(__linux__)
+        ggml_backend_cuda_comm_init_nccl(ret);
+#else
+        ggml_backend_cuda_comm_init_internal(ret);
+#endif // defined(__linux__)
+    } else {
+        std::string env_str(env);
+        if (env_str == "nccl") {
+            ggml_backend_cuda_comm_init_nccl(ret);
+        } else if (env_str == "internal") {
+            ggml_backend_cuda_comm_init_internal(ret);
+        } else if (env_str == "none") {
+            ggml_backend_cuda_comm_init_none(ret);
+        } else {
+            // Unknown value: warn and use the "none" path.
+            GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
+            ggml_backend_cuda_comm_init_none(ret);
+        }
+    }
+
+    return ret;
+}
+
+// Top-level dispatch: invokes the try_allreduce function pointer stored in
+// the context by comm_init and returns its result.  A null context yields
+// false.  Returning false lets the meta-backend's butterfly run.
+static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
+    auto * ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
+    return ctx != nullptr && ctx->try_allreduce(ctx, tensors);
 }

 ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
@@ -3757,6 +3909,35 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
        return 2;
    }

+    // Snake activation: y = x + sin(a*x)^2 * inv_b
+    // Naive 5-op decomposition emitted by frontends: mul -> sin -> sqr -> mul -> add
+    if (ggml_can_fuse_subgraph(cgraph, i,
+            { GGML_OP_MUL, GGML_OP_SIN, GGML_OP_SQR, GGML_OP_MUL, GGML_OP_ADD },
+            { i + 4 })) {
+        const ggml_tensor * mul0 = cgraph->nodes[i];
+        const ggml_tensor * sqr  = cgraph->nodes[i + 2];
+        const ggml_tensor * mul1 = cgraph->nodes[i + 3];
+        ggml_tensor *       add  = cgraph->nodes[i + 4];
+
+        // x carries the full activation shape, a is the broadcast operand
+        const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
+        const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
+
+        // mul1 reads sqr and inv_b in either operand order
+        const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
+
+        // closure check: the trailing add must read the same x as the leading mul
+        const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
+
+        const bool type_ok  = (x->type == GGML_TYPE_F32 || x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16);
+        const bool shape_ok = ggml_are_same_shape(a, inv_b) && a->ne[0] == 1 && a->ne[1] == x->ne[1];
+
+        if (type_ok && shape_ok && x_in_add == x && add->type == x->type) {
+            ggml_cuda_op_snake_fused(*cuda_ctx, x, a, inv_b, add);
+            return 4;
+        }
+    }
+
    // multi-(add or mul)
    if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
        int     n_fuse = 0;
@@ -5434,6 +5615,9 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                char pci_bus_id[32] = {};
                CUDA_CHECK(cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i));
                dev_ctx->pci_bus_id = pci_bus_id;
+                for (char & c : dev_ctx->pci_bus_id) {
+                    c = std::tolower(c);
+                }
                dev_ctx->op_offload_min_batch_size = min_batch_size;

                ggml_backend_dev_t dev = new ggml_backend_device {
--- a/ggml/src/ggml-cuda/out-prod.cu
+++ b/ggml/src/ggml-cuda/out-prod.cu
@@ -54,15 +54,31 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int64_t dps2 = ne2 / ne02;
    const int64_t dps3 = ne3 / ne03;

-    // TODO batched matrix multiplication
-    for (int64_t i3 = 0; i3 < ne3; ++i3) {
-        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+    if (dps2 == 1 && ne2 > 1) {
+        // src0 has uniform stride s02 along dim 2; batch the inner loop with a strided GEMM
+        GGML_ASSERT(ne2 <= std::numeric_limits<int>::max());
+        const int batch_count = (int) ne2;
+        for (int64_t i3 = 0; i3 < ne3; ++i3) {
            CUBLAS_CHECK(
-                cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
+                cublasSgemmStridedBatched(handle, CUBLAS_OP_N, src1_cublas_op,
                        ne0, ne1, ne01,
-                        &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
-                                src1_d +  i3      *s13 +  i2      *s12, ldb,
-                        &beta,  dst_d  +  i3      *s3  +  i2      *s2,  ldc));
+                        &alpha, src0_d + (i3/dps3)*s03, lda, s02,
+                                src1_d +  i3     *s13, ldb, s12,
+                        &beta,  dst_d  +  i3     *s3,  ldc, s2,
+                        batch_count));
+        }
+    } else {
+        // Fallback: ne2 == 1 (no batching benefit) or dps2 > 1 (src0 broadcast along dim 2
+        // with non-uniform stride; would need cublasSgemmBatched with pointer arrays).
+        for (int64_t i3 = 0; i3 < ne3; ++i3) {
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                CUBLAS_CHECK(
+                    cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
+                            ne0, ne1, ne01,
+                            &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
+                                    src1_d +  i3      *s13 +  i2      *s12, ldb,
+                            &beta,  dst_d  +  i3      *s3  +  i2      *s2,  ldc));
+            }
        }
    }
 }
--- a/ggml/src/ggml-cuda/snake.cu
+++ b/ggml/src/ggml-cuda/snake.cu
@@ -0,0 +1,72 @@
+#include "snake.cuh"
+#include "convert.cuh"
+
+// Fused Snake activation: y = x + sin^2(a * x) * inv_b
+// x: [T, C] (T contiguous), a: [1, C], inv_b: [1, C]
+// Supports F32, F16, BF16 data with F32 compute.
+
+// One thread per element: reads x[i], looks up the per-row coefficients
+// a[c] / inv_b[c] with c = i / T_len (via fastdiv), and writes
+// x + sin(a*x)^2 * inv_b to dst[i].  Arithmetic is done in float; loads and
+// stores are converted from/to T.  Threads with i >= total do nothing.
+template <typename T>
+static __global__ void snake_kernel(
+        const T     * __restrict__ x,
+        const float * __restrict__ a,
+        const float * __restrict__ inv_b,
+        T           * __restrict__ dst,
+        const int    total,
+        const uint3  T_len_fastdiv) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < total) {
+        const int   row    = (int) fastdiv((uint32_t) i, T_len_fastdiv);
+        const float x_val  = ggml_cuda_cast<float>(x[i]);
+        const float sin_ax = sinf(a[row] * x_val);
+
+        dst[i] = ggml_cuda_cast<T>(x_val + sin_ax * sin_ax * inv_b[row]);
+    }
+}
+
+// Launches snake_kernel on the context's stream for explicitly supplied
+// x / a / inv_b / dst tensors.  The element count is x->ne[0] * x->ne[1];
+// a and inv_b are read as float buffers; the kernel instantiation is picked
+// from x->type (F32, F16 or BF16), any other type aborts.
+static void launch_snake(ggml_backend_cuda_context & ctx,
+                         const ggml_tensor * x,
+                         const ggml_tensor * a,
+                         const ggml_tensor * inv_b,
+                         ggml_tensor *       dst) {
+    cudaStream_t stream = ctx.stream();
+
+    const int n_inner = (int)x->ne[0];
+    const int n_outer = (int)x->ne[1];
+    const int n_elems = n_inner * n_outer;
+
+    const int threads_per_block = 256;
+    const int n_blocks          = (n_elems + threads_per_block - 1) / threads_per_block;
+
+    const uint3 inner_fastdiv = init_fastdiv_values((uint64_t) n_inner);
+
+    const float * a_data     = (const float *)a->data;
+    const float * inv_b_data = (const float *)inv_b->data;
+
+    if (x->type == GGML_TYPE_F32) {
+        snake_kernel<<<n_blocks, threads_per_block, 0, stream>>>(
+            (const float *)x->data, a_data, inv_b_data, (float *)dst->data, n_elems, inner_fastdiv);
+    } else if (x->type == GGML_TYPE_F16) {
+        snake_kernel<<<n_blocks, threads_per_block, 0, stream>>>(
+            (const half *)x->data, a_data, inv_b_data, (half *)dst->data, n_elems, inner_fastdiv);
+    } else if (x->type == GGML_TYPE_BF16) {
+        snake_kernel<<<n_blocks, threads_per_block, 0, stream>>>(
+            (const nv_bfloat16 *)x->data, a_data, inv_b_data, (nv_bfloat16 *)dst->data, n_elems, inner_fastdiv);
+    } else {
+        GGML_ABORT("snake: unsupported type");
+    }
+}
+
+// Fusion entry: caller supplies x/a/inv_b explicitly from the matched
+// mul -> sin -> sqr -> mul -> add pattern. The dst is the trailing add output.
+// This is a direct forward to launch_snake; no validation is done here.
+void ggml_cuda_op_snake_fused(ggml_backend_cuda_context & ctx,
+                              const ggml_tensor * x,
+                              const ggml_tensor * a,
+                              const ggml_tensor * inv_b,
+                              ggml_tensor *       dst) {
+    launch_snake(ctx, x, a, inv_b, dst);
+}
--- a/ggml/src/ggml-cuda/snake.cuh
+++ b/ggml/src/ggml-cuda/snake.cuh
@@ -0,0 +1,8 @@
+#include "common.cuh"
+
+// Fusion entry point. Caller supplies x/a/inv_b explicitly.
+// Computes y = x + sin^2(a * x) * inv_b into dst on the stream of ctx.
+void ggml_cuda_op_snake_fused(ggml_backend_cuda_context & ctx,
+                              const ggml_tensor * x,
+                              const ggml_tensor * a,
+                              const ggml_tensor * inv_b,
+                              ggml_tensor *       dst);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(192, 128, 1, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 1, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 1, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(192, 128, 2, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 2, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 2, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(192, 128, 4, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 4, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 4, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 8, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 8, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-tile.cuh"
+
+DECL_FATTN_TILE_CASE(192, 128);
--- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
+++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -3,7 +3,10 @@
 from glob import glob
 import os

-HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 320, 512, 576]
+HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 192, 256, 320, 512, 576]
+
+# DKQ -> DV override for asymmetric head dims.
+HEAD_SIZES_V_OVERRIDE = {576: 512, 320: 256, 192: 128}

 TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]

@@ -62,7 +65,7 @@ for filename in glob("*.cu"):
    os.remove(filename)

 for head_size_kq in HEAD_SIZES_KQ:
-    head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
+    head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
    with open(f"fattn-tile-instance-dkq{head_size_kq}-dv{head_size_v}.cu", "w") as f:
        f.write(SOURCE_FATTN_TILE.format(head_size_kq=head_size_kq, head_size_v=head_size_v))

@@ -85,15 +88,17 @@ for ncols in [8, 16, 32, 64]:
                if head_size_kq == 72:
                    continue
                # Skip compilation of unused ncols2 values for niche head sizes:
+                if head_size_kq == 192 and ncols2 not in (8, 16): # MiMo-V2.5
+                    continue
                if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
                    continue
                if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
                    continue
                if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
                    continue
-                if head_size_kq not in (320, 576) and ncols2 in (16, 32):
+                if head_size_kq not in (192, 320, 576) and ncols2 in (16, 32):
                    continue
-                head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
+                head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
                f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))

 for type in TYPES_MMQ:
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -48,6 +48,7 @@
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
+#define cublasSgemmStridedBatched hipblasSgemmStridedBatched
 #define cublasStatus_t hipblasStatus_t
 #define cublasOperation_t hipblasOperation_t
 #define cudaDevAttrCooperativeLaunch hipDeviceAttributeCooperativeLaunch
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -32,6 +32,7 @@
 #define cublasSetMathMode mublasSetMathMode
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
+#define cublasSgemmStridedBatched mublasSgemmStridedBatched
 #define cublasStatus_t mublasStatus_t
 #define cublasOperation_t mublasOperation_t
 #define cublasGetStatusString mublasGetStatusString
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2261,6 +2261,58 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
    return true;
 }

+// Support check for GGML_OP_GATED_DELTA_NET.  Accepts the op only when all six
+// sources (q, k, v, g, beta, state) are present, everything including the
+// output is F32, the layout constraints hold, and the shapes agree with the
+// dimensions taken from v.  The session argument is not consulted.
+static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    GGML_UNUSED(sess);
+
+    // sources in order: q, k, v, g, beta, state
+    const struct ggml_tensor * inputs[6];
+    for (int i = 0; i < 6; ++i) {
+        inputs[i] = op->src[i];
+        if (inputs[i] == nullptr) {
+            return false;
+        }
+    }
+
+    // only F32 everywhere
+    if (op->type != GGML_TYPE_F32) {
+        return false;
+    }
+    for (int i = 0; i < 6; ++i) {
+        if (inputs[i]->type != GGML_TYPE_F32) {
+            return false;
+        }
+    }
+
+    const struct ggml_tensor * q     = inputs[0];
+    const struct ggml_tensor * k     = inputs[1];
+    const struct ggml_tensor * v     = inputs[2];
+    const struct ggml_tensor * g     = inputs[3];
+    const struct ggml_tensor * beta  = inputs[4];
+    const struct ggml_tensor * state = inputs[5];
+
+    // q/k/v need contiguous rows; g, beta, state and the output must be fully contiguous
+    const bool qkv_rows_ok = ggml_is_contiguous_rows(q) && ggml_is_contiguous_rows(k) && ggml_is_contiguous_rows(v);
+    const bool rest_ok     = ggml_is_contiguous(g) && ggml_is_contiguous(beta) && ggml_is_contiguous(state) &&
+                             ggml_is_contiguous(op);
+    if (!qkv_rows_ok || !rest_ok) {
+        return false;
+    }
+
+    // reference dimensions come from v
+    const int64_t s_v    = v->ne[0];
+    const int64_t n_h    = v->ne[1];
+    const int64_t n_tok  = v->ne[2];
+    const int64_t n_seq  = v->ne[3];
+
+    const bool v_dims_ok = s_v > 0 && s_v <= 128 && n_h > 0 && n_tok > 0 && n_seq > 0;
+    if (!v_dims_ok) {
+        return false;
+    }
+
+    // q and k follow the same rule; ne[3] is checked for > 0 before it is used as a divisor
+    const struct ggml_tensor * qk[2] = { q, k };
+    for (int i = 0; i < 2; ++i) {
+        const struct ggml_tensor * t = qk[i];
+        if (t->ne[0] != s_v || t->ne[1] <= 0 || t->ne[2] != n_tok || t->ne[3] <= 0 || (n_seq % t->ne[3]) != 0) {
+            return false;
+        }
+    }
+
+    const bool g_ok = (g->ne[0] == 1 || g->ne[0] == s_v);
+    if (!g_ok || beta->ne[0] != 1) {
+        return false;
+    }
+
+    if (ggml_nelements(state) != s_v * s_v * n_h * n_seq) {
+        return false;
+    }
+
+    const int64_t dst_ne1 = n_tok * n_seq + s_v * n_seq;
+    return op->ne[0] == s_v * n_h && op->ne[1] == dst_ne1;
+}
+
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];
@@ -2420,8 +2472,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
        return false;
    }

-    // TODO: add support for non-contiguous elements within a row
-    if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) {
+    // dst must be contiguous; src0 may be non-contiguous
+    if (!ggml_is_contiguous(dst)) {
        return false;
    }

@@ -2777,32 +2829,34 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {

 static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
    switch (t->op) {
-        case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
-        case GGML_OP_MUL_MAT:        return HTP_OP_MUL_MAT;
-        case GGML_OP_MUL_MAT_ID:     return HTP_OP_MUL_MAT_ID;
-        case GGML_OP_MUL:            return HTP_OP_MUL;
-        case GGML_OP_ADD:            return HTP_OP_ADD;
-        case GGML_OP_ADD_ID:         return HTP_OP_ADD_ID;
-        case GGML_OP_SUB:            return HTP_OP_SUB;
-        case GGML_OP_DIV:            return HTP_OP_DIV;
-        case GGML_OP_CPY:            return HTP_OP_CPY;
-        case GGML_OP_CONT:           return HTP_OP_CPY;
-        case GGML_OP_GET_ROWS:       return HTP_OP_GET_ROWS;
-        case GGML_OP_SET_ROWS:       return HTP_OP_SET_ROWS;
-        case GGML_OP_SUM_ROWS:       return HTP_OP_SUM_ROWS;
-        case GGML_OP_ARGSORT:        return HTP_OP_ARGSORT;
-        case GGML_OP_RMS_NORM:       return HTP_OP_RMS_NORM;
-        case GGML_OP_SCALE:          return HTP_OP_SCALE;
-        case GGML_OP_SQR:            return HTP_OP_SQR;
-        case GGML_OP_SQRT:           return HTP_OP_SQRT;
-        case GGML_OP_SOFT_MAX:       return HTP_OP_SOFTMAX;
-        case GGML_OP_SSM_CONV:       return HTP_OP_SSM_CONV;
-        case GGML_OP_ROPE:           return HTP_OP_ROPE;
-        case GGML_OP_REPEAT:         return HTP_OP_REPEAT;
-        case GGML_OP_CUMSUM:         return HTP_OP_CUMSUM;
-        case GGML_OP_FILL:           return HTP_OP_FILL;
-        case GGML_OP_DIAG:           return HTP_OP_DIAG;
-        case GGML_OP_SOLVE_TRI:      return HTP_OP_SOLVE_TRI;
+        case GGML_OP_FLASH_ATTN_EXT:  return HTP_OP_FLASH_ATTN_EXT;
+        case GGML_OP_MUL_MAT:         return HTP_OP_MUL_MAT;
+        case GGML_OP_MUL_MAT_ID:      return HTP_OP_MUL_MAT_ID;
+        case GGML_OP_MUL:             return HTP_OP_MUL;
+        case GGML_OP_ADD:             return HTP_OP_ADD;
+        case GGML_OP_ADD_ID:          return HTP_OP_ADD_ID;
+        case GGML_OP_SUB:             return HTP_OP_SUB;
+        case GGML_OP_DIV:             return HTP_OP_DIV;
+        case GGML_OP_CPY:             return HTP_OP_CPY;
+        case GGML_OP_CONT:            return HTP_OP_CPY;
+        case GGML_OP_GET_ROWS:        return HTP_OP_GET_ROWS;
+        case GGML_OP_SET_ROWS:        return HTP_OP_SET_ROWS;
+        case GGML_OP_SUM_ROWS:        return HTP_OP_SUM_ROWS;
+        case GGML_OP_ARGSORT:         return HTP_OP_ARGSORT;
+        case GGML_OP_L2_NORM:         return HTP_OP_L2_NORM;
+        case GGML_OP_RMS_NORM:        return HTP_OP_RMS_NORM;
+        case GGML_OP_SCALE:           return HTP_OP_SCALE;
+        case GGML_OP_SQR:             return HTP_OP_SQR;
+        case GGML_OP_SQRT:            return HTP_OP_SQRT;
+        case GGML_OP_SOFT_MAX:        return HTP_OP_SOFTMAX;
+        case GGML_OP_SSM_CONV:        return HTP_OP_SSM_CONV;
+        case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
+        case GGML_OP_ROPE:            return HTP_OP_ROPE;
+        case GGML_OP_REPEAT:          return HTP_OP_REPEAT;
+        case GGML_OP_CUMSUM:          return HTP_OP_CUMSUM;
+        case GGML_OP_FILL:            return HTP_OP_FILL;
+        case GGML_OP_DIAG:            return HTP_OP_DIAG;
+        case GGML_OP_SOLVE_TRI:       return HTP_OP_SOLVE_TRI;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(t)) {
                case GGML_UNARY_OP_SILU:     return HTP_OP_UNARY_SILU;
@@ -3253,6 +3307,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_add_id(sess, op);
            break;

+        case GGML_OP_L2_NORM:
+            supp = ggml_hexagon_supported_unary(sess, op);
+            break;
+
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
            supp = ggml_hexagon_supported_unary(sess, op);
@@ -3336,6 +3394,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_ssm_conv(sess, op);
            break;

+        case GGML_OP_GATED_DELTA_NET:
+            supp = ggml_hexagon_supported_gated_delta_net(sess, op);
+            break;
+
        case GGML_OP_CUMSUM:
            supp = ggml_hexagon_supported_cumsum(sess, op);
            break;
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -37,6 +37,7 @@ add_library(${HTP_LIB} SHARED
    fill-ops.c
    diag-ops.c
    solve-tri-ops.c
+    gated-delta-net-ops.c
 )

 target_compile_definitions(${HTP_LIB} PRIVATE
--- a/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
+++ b/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
@@ -0,0 +1,955 @@
+#include <math.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "hvx-utils.h"
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+
+#ifndef MIN // only define MIN when no earlier header has already provided one
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) // function-like macro: each argument is evaluated twice, so avoid side effects
+#endif
+
+#define HTP_GDN_MAX_SV 128 // float capacity of the local_gate/local_q/local_k stack buffers in the worker; S_v floats are memcpy'd into them, so S_v must not exceed this (bound check not visible here - confirm)
+
+struct htp_gdn_context { // per-op state handed to the gated-delta-net worker threads through their `data` pointer
+    struct htp_ops_context * octx; // op context; workers read src[0..5] and dst from it
+    uint32_t rows_per_thread; // NOTE(review): not used in the visible code; presumably the per-thread share of rows - confirm
+    size_t state_bytes; // byte count passed to memcpy when a row's input state is copied into the output state
+    bool use_vtcm; // NOTE(review): presumably selects a VTCM-backed state path - confirm at the use site
+    uint8_t * vtcm_state_base; // NOTE(review): presumably the base of the VTCM state scratch area - confirm
+    size_t vtcm_state_per_thread; // NOTE(review): presumably the per-thread size/stride inside that area - confirm
+};
+
+// Single-row gate-and-dot step.
+//
+// Updates dst[i] *= mul[i] for i in [0, n) in place and returns
+// sum_i(dst[i] * dot[i]) computed from the updated values.
+//
+//   dst : row updated in place (hvx_vmemu loads/stores)
+//   mul : per-element multipliers (hvx_vmem loads)
+//   dot : vector the updated row is dotted with (hvx_vmem loads)
+//   n   : number of f32 elements; a remainder below one vector is handled
+//
+// NOTE(review): mul and dot go through hvx_vmem rather than hvx_vmemu, which
+// presumably requires vector-aligned pointers - confirm against hvx-utils.h.
+// The remainder path still issues whole-vector loads at the tail offset; only
+// the store and the accumulation are restricted to the valid lanes.
+static inline float gdn_mul_dot_f32(float * restrict dst, const float * restrict mul,
+        const float * restrict dot, uint32_t n) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+    const HVX_Vector zero = Q6_V_vzero();
+
+    HVX_Vector partial = zero;
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector row   = hvx_vmemu(dst + pos);
+        const HVX_Vector g     = hvx_vmem(mul + pos);
+        const HVX_Vector w     = hvx_vmem(dot + pos);
+        const HVX_Vector gated = hvx_vec_mul_f32_f32(row, g);
+
+        hvx_vmemu(dst + pos) = gated;
+        partial = hvx_vec_add_f32_f32(partial, hvx_vec_mul_f32_f32(gated, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+
+        const HVX_Vector row   = hvx_vmemu(dst + body);
+        const HVX_Vector g     = hvx_vmem(mul + body);
+        const HVX_Vector w     = hvx_vmem(dot + body);
+        const HVX_Vector gated = hvx_vec_mul_f32_f32(row, g);
+
+        // Write back only the valid remainder bytes of the row.
+        hvx_vec_store_u(dst + body, rem_bytes, gated);
+
+        // Replace the lanes past the remainder with zero before accumulating.
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     terms = Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(gated, w), zero);
+        partial = hvx_vec_add_f32_f32(partial, terms);
+    }
+
+    const HVX_Vector total = hvx_vec_reduce_sum_f32(partial);
+    return hvx_vec_get_f32(total);
+}
+
+// Single-row scalar-gate-and-dot step.
+//
+// Updates dst[i] *= mul for i in [0, n) in place and returns
+// sum_i(dst[i] * dot[i]) computed from the updated values.
+//
+//   dst : row updated in place (hvx_vmemu loads/stores)
+//   mul : scalar multiplier, broadcast to every lane
+//   dot : vector the updated row is dotted with (hvx_vmem loads)
+//   n   : number of f32 elements; a remainder below one vector is handled
+//
+// NOTE(review): dot goes through hvx_vmem rather than hvx_vmemu, which
+// presumably requires a vector-aligned pointer - confirm against hvx-utils.h.
+// The remainder path still issues whole-vector loads at the tail offset; only
+// the store and the accumulation are restricted to the valid lanes.
+static inline float gdn_mul_scalar_dot_f32(float * restrict dst, float mul,
+        const float * restrict dot, uint32_t n) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+    const HVX_Vector zero   = Q6_V_vzero();
+    const HVX_Vector factor = hvx_vec_splat_f32(mul);
+
+    HVX_Vector partial = zero;
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector row    = hvx_vmemu(dst + pos);
+        const HVX_Vector w      = hvx_vmem(dot + pos);
+        const HVX_Vector scaled = hvx_vec_mul_f32_f32(row, factor);
+
+        hvx_vmemu(dst + pos) = scaled;
+        partial = hvx_vec_add_f32_f32(partial, hvx_vec_mul_f32_f32(scaled, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+
+        const HVX_Vector row    = hvx_vmemu(dst + body);
+        const HVX_Vector w      = hvx_vmem(dot + body);
+        const HVX_Vector scaled = hvx_vec_mul_f32_f32(row, factor);
+
+        // Write back only the valid remainder bytes of the row.
+        hvx_vec_store_u(dst + body, rem_bytes, scaled);
+
+        // Replace the lanes past the remainder with zero before accumulating.
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     terms = Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(scaled, w), zero);
+        partial = hvx_vec_add_f32_f32(partial, terms);
+    }
+
+    const HVX_Vector total = hvx_vec_reduce_sum_f32(partial);
+    return hvx_vec_get_f32(total);
+}
+
+// Single-row scaled-accumulate-and-dot step.
+//
+// Updates dst[i] += src[i] * scale for i in [0, n) in place and returns
+// sum_i(dst[i] * dot[i]) computed from the updated values.
+//
+//   dst   : row updated in place (hvx_vmemu loads/stores)
+//   src   : vector added into the row after scaling (hvx_vmem loads)
+//   scale : scalar applied to src, broadcast to every lane
+//   dot   : vector the updated row is dotted with (hvx_vmem loads)
+//   n     : number of f32 elements; a remainder below one vector is handled
+//
+// NOTE(review): src and dot go through hvx_vmem rather than hvx_vmemu, which
+// presumably requires vector-aligned pointers - confirm against hvx-utils.h.
+// The remainder path still issues whole-vector loads at the tail offset; only
+// the store and the accumulation are restricted to the valid lanes.
+static inline float gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src,
+        float scale, const float * restrict dot, uint32_t n) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+    const HVX_Vector zero   = Q6_V_vzero();
+    const HVX_Vector factor = hvx_vec_splat_f32(scale);
+
+    HVX_Vector partial = zero;
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector row     = hvx_vmemu(dst + pos);
+        const HVX_Vector add     = hvx_vmem(src + pos);
+        const HVX_Vector w       = hvx_vmem(dot + pos);
+        const HVX_Vector updated = hvx_vec_add_f32_f32(row, hvx_vec_mul_f32_f32(add, factor));
+
+        hvx_vmemu(dst + pos) = updated;
+        partial = hvx_vec_add_f32_f32(partial, hvx_vec_mul_f32_f32(updated, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+
+        const HVX_Vector row     = hvx_vmemu(dst + body);
+        const HVX_Vector add     = hvx_vmem(src + body);
+        const HVX_Vector w       = hvx_vmem(dot + body);
+        const HVX_Vector updated = hvx_vec_add_f32_f32(row, hvx_vec_mul_f32_f32(add, factor));
+
+        // Write back only the valid remainder bytes of the row.
+        hvx_vec_store_u(dst + body, rem_bytes, updated);
+
+        // Replace the lanes past the remainder with zero before accumulating.
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     terms = Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(updated, w), zero);
+        partial = hvx_vec_add_f32_f32(partial, terms);
+    }
+
+    const HVX_Vector total = hvx_vec_reduce_sum_f32(partial);
+    return hvx_vec_get_f32(total);
+}
+
+// Four-row gate-and-dot step; all rows share the same `mul` and `dot` vectors.
+//
+// For each row r in 0..3: dst<r>[i] *= mul[i] for i in [0, n), and the dot
+// product of the updated row with `dot` is produced for that row.
+//
+//   dst0..dst3 : rows updated in place (hvx_vmemu loads/stores)
+//   mul, dot   : shared inputs (hvx_vmem loads)
+//   n          : f32 elements per row; a remainder below one vector is handled
+//   sums       : receives 4 floats (16 bytes) taken from the result of
+//                hvx_vec_reduce_sum_f32x4 over the four accumulators
+//
+// NOTE(review): mul and dot go through hvx_vmem, which presumably requires
+// vector-aligned pointers - confirm against hvx-utils.h. The remainder path
+// issues whole-vector loads; only stores and accumulation are lane-limited.
+static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, const float * restrict mul,
+        const float * restrict dot, uint32_t n, float * restrict sums) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+
+    HVX_Vector s0 = Q6_V_vzero();
+    HVX_Vector s1 = Q6_V_vzero();
+    HVX_Vector s2 = Q6_V_vzero();
+    HVX_Vector s3 = Q6_V_vzero();
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector g = hvx_vmem(mul + pos);
+        const HVX_Vector w = hvx_vmem(dot + pos);
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + pos), g);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + pos), g);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + pos), g);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + pos), g);
+
+        hvx_vmemu(dst0 + pos) = u0;
+        hvx_vmemu(dst1 + pos) = u1;
+        hvx_vmemu(dst2 + pos) = u2;
+        hvx_vmemu(dst3 + pos) = u3;
+
+        s0 = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(u0, w));
+        s1 = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(u1, w));
+        s2 = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(u2, w));
+        s3 = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(u3, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+        const HVX_Vector g = hvx_vmem(mul + body);
+        const HVX_Vector w = hvx_vmem(dot + body);
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     none  = Q6_V_vzero();
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + body), g);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + body), g);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + body), g);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + body), g);
+
+        // Write back only the valid remainder bytes of each row.
+        hvx_vec_store_u(dst0 + body, rem_bytes, u0);
+        hvx_vec_store_u(dst1 + body, rem_bytes, u1);
+        hvx_vec_store_u(dst2 + body, rem_bytes, u2);
+        hvx_vec_store_u(dst3 + body, rem_bytes, u3);
+
+        // Lanes past the remainder are replaced with zero before accumulating.
+        s0 = hvx_vec_add_f32_f32(s0, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u0, w), none));
+        s1 = hvx_vec_add_f32_f32(s1, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u1, w), none));
+        s2 = hvx_vec_add_f32_f32(s2, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u2, w), none));
+        s3 = hvx_vec_add_f32_f32(s3, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u3, w), none));
+    }
+
+    HVX_Vector_x4 packed = { .v = { s0, s1, s2, s3 } };
+    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(packed));
+}
+
+// Four-row scalar-gate-and-dot step; all rows share `mul` and `dot`.
+//
+// For each row r in 0..3: dst<r>[i] *= mul for i in [0, n), and the dot
+// product of the updated row with `dot` is produced for that row.
+//
+//   dst0..dst3 : rows updated in place (hvx_vmemu loads/stores)
+//   mul        : scalar multiplier, broadcast to every lane
+//   dot        : shared input (hvx_vmem loads)
+//   n          : f32 elements per row; a remainder below one vector is handled
+//   sums       : receives 4 floats (16 bytes) taken from the result of
+//                hvx_vec_reduce_sum_f32x4 over the four accumulators
+//
+// NOTE(review): dot goes through hvx_vmem, which presumably requires a
+// vector-aligned pointer - confirm against hvx-utils.h. The remainder path
+// issues whole-vector loads; only stores and accumulation are lane-limited.
+static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float mul,
+        const float * restrict dot, uint32_t n, float * restrict sums) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+    const HVX_Vector factor = hvx_vec_splat_f32(mul);
+
+    HVX_Vector s0 = Q6_V_vzero();
+    HVX_Vector s1 = Q6_V_vzero();
+    HVX_Vector s2 = Q6_V_vzero();
+    HVX_Vector s3 = Q6_V_vzero();
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector w = hvx_vmem(dot + pos);
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + pos), factor);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + pos), factor);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + pos), factor);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + pos), factor);
+
+        hvx_vmemu(dst0 + pos) = u0;
+        hvx_vmemu(dst1 + pos) = u1;
+        hvx_vmemu(dst2 + pos) = u2;
+        hvx_vmemu(dst3 + pos) = u3;
+
+        s0 = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(u0, w));
+        s1 = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(u1, w));
+        s2 = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(u2, w));
+        s3 = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(u3, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+        const HVX_Vector w = hvx_vmem(dot + body);
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     none  = Q6_V_vzero();
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + body), factor);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + body), factor);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + body), factor);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + body), factor);
+
+        // Write back only the valid remainder bytes of each row.
+        hvx_vec_store_u(dst0 + body, rem_bytes, u0);
+        hvx_vec_store_u(dst1 + body, rem_bytes, u1);
+        hvx_vec_store_u(dst2 + body, rem_bytes, u2);
+        hvx_vec_store_u(dst3 + body, rem_bytes, u3);
+
+        // Lanes past the remainder are replaced with zero before accumulating.
+        s0 = hvx_vec_add_f32_f32(s0, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u0, w), none));
+        s1 = hvx_vec_add_f32_f32(s1, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u1, w), none));
+        s2 = hvx_vec_add_f32_f32(s2, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u2, w), none));
+        s3 = hvx_vec_add_f32_f32(s3, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u3, w), none));
+    }
+
+    HVX_Vector_x4 packed = { .v = { s0, s1, s2, s3 } };
+    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(packed));
+}
+
+// Four-row scaled-accumulate-and-dot step; rows share `src` and `dot` but
+// each row has its own scale factor.
+//
+// For each row r in 0..3: dst<r>[i] += src[i] * scale[r] for i in [0, n), and
+// the dot product of the updated row with `dot` is produced for that row.
+//
+//   dst0..dst3 : rows updated in place (hvx_vmemu loads/stores)
+//   src        : shared vector added into every row (hvx_vmem loads)
+//   scale      : 4 per-row scalars; scale[0..3] are read
+//   dot        : shared input (hvx_vmem loads)
+//   n          : f32 elements per row; a remainder below one vector is handled
+//   sums       : receives 4 floats (16 bytes) taken from the result of
+//                hvx_vec_reduce_sum_f32x4 over the four accumulators
+//
+// NOTE(review): src and dot go through hvx_vmem, which presumably requires
+// vector-aligned pointers - confirm against hvx-utils.h. The remainder path
+// issues whole-vector loads; only stores and accumulation are lane-limited.
+static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, const float * restrict src,
+        const float * restrict scale, const float * restrict dot, uint32_t n,
+        float * restrict sums) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+
+    // Broadcast each row's scalar once, outside the loops.
+    const HVX_Vector f0 = hvx_vec_splat_f32(scale[0]);
+    const HVX_Vector f1 = hvx_vec_splat_f32(scale[1]);
+    const HVX_Vector f2 = hvx_vec_splat_f32(scale[2]);
+    const HVX_Vector f3 = hvx_vec_splat_f32(scale[3]);
+
+    HVX_Vector s0 = Q6_V_vzero();
+    HVX_Vector s1 = Q6_V_vzero();
+    HVX_Vector s2 = Q6_V_vzero();
+    HVX_Vector s3 = Q6_V_vzero();
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector add = hvx_vmem(src + pos);
+        const HVX_Vector w   = hvx_vmem(dot + pos);
+
+        const HVX_Vector u0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + pos), hvx_vec_mul_f32_f32(add, f0));
+        const HVX_Vector u1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + pos), hvx_vec_mul_f32_f32(add, f1));
+        const HVX_Vector u2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + pos), hvx_vec_mul_f32_f32(add, f2));
+        const HVX_Vector u3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + pos), hvx_vec_mul_f32_f32(add, f3));
+
+        hvx_vmemu(dst0 + pos) = u0;
+        hvx_vmemu(dst1 + pos) = u1;
+        hvx_vmemu(dst2 + pos) = u2;
+        hvx_vmemu(dst3 + pos) = u3;
+
+        s0 = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(u0, w));
+        s1 = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(u1, w));
+        s2 = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(u2, w));
+        s3 = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(u3, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+        const HVX_Vector add = hvx_vmem(src + body);
+        const HVX_Vector w   = hvx_vmem(dot + body);
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     none  = Q6_V_vzero();
+
+        const HVX_Vector u0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + body), hvx_vec_mul_f32_f32(add, f0));
+        const HVX_Vector u1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + body), hvx_vec_mul_f32_f32(add, f1));
+        const HVX_Vector u2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + body), hvx_vec_mul_f32_f32(add, f2));
+        const HVX_Vector u3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + body), hvx_vec_mul_f32_f32(add, f3));
+
+        // Write back only the valid remainder bytes of each row.
+        hvx_vec_store_u(dst0 + body, rem_bytes, u0);
+        hvx_vec_store_u(dst1 + body, rem_bytes, u1);
+        hvx_vec_store_u(dst2 + body, rem_bytes, u2);
+        hvx_vec_store_u(dst3 + body, rem_bytes, u3);
+
+        // Lanes past the remainder are replaced with zero before accumulating.
+        s0 = hvx_vec_add_f32_f32(s0, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u0, w), none));
+        s1 = hvx_vec_add_f32_f32(s1, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u1, w), none));
+        s2 = hvx_vec_add_f32_f32(s2, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u2, w), none));
+        s3 = hvx_vec_add_f32_f32(s3, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u3, w), none));
+    }
+
+    HVX_Vector_x4 packed = { .v = { s0, s1, s2, s3 } };
+    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(packed));
+}
+
+// Eight-row gate-and-dot step; all rows share the same `mul` and `dot` vectors.
+//
+// For each row r in 0..7: dst<r>[i] *= mul[i] for i in [0, n), and the dot
+// product of the updated row with `dot` is produced for that row.
+//
+//   dst0..dst7 : rows updated in place (hvx_vmemu loads/stores)
+//   mul, dot   : shared inputs (hvx_vmem loads)
+//   n          : f32 elements per row; a remainder below one vector is handled
+//   sums       : receives 8 floats, written as two 16-byte stores at sums + 0
+//                (rows 0..3) and sums + 4 (rows 4..7), each taken from a
+//                hvx_vec_reduce_sum_f32x4 result
+//
+// NOTE(review): mul and dot go through hvx_vmem, which presumably requires
+// vector-aligned pointers - confirm against hvx-utils.h. The remainder path
+// issues whole-vector loads; only stores and accumulation are lane-limited.
+static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float * restrict dst4,
+        float * restrict dst5, float * restrict dst6, float * restrict dst7,
+        const float * restrict mul, const float * restrict dot, uint32_t n,
+        float * restrict sums) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+
+    HVX_Vector s0 = Q6_V_vzero();
+    HVX_Vector s1 = Q6_V_vzero();
+    HVX_Vector s2 = Q6_V_vzero();
+    HVX_Vector s3 = Q6_V_vzero();
+    HVX_Vector s4 = Q6_V_vzero();
+    HVX_Vector s5 = Q6_V_vzero();
+    HVX_Vector s6 = Q6_V_vzero();
+    HVX_Vector s7 = Q6_V_vzero();
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector g = hvx_vmem(mul + pos);
+        const HVX_Vector w = hvx_vmem(dot + pos);
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + pos), g);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + pos), g);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + pos), g);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + pos), g);
+        const HVX_Vector u4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + pos), g);
+        const HVX_Vector u5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + pos), g);
+        const HVX_Vector u6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + pos), g);
+        const HVX_Vector u7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + pos), g);
+
+        hvx_vmemu(dst0 + pos) = u0;
+        hvx_vmemu(dst1 + pos) = u1;
+        hvx_vmemu(dst2 + pos) = u2;
+        hvx_vmemu(dst3 + pos) = u3;
+        hvx_vmemu(dst4 + pos) = u4;
+        hvx_vmemu(dst5 + pos) = u5;
+        hvx_vmemu(dst6 + pos) = u6;
+        hvx_vmemu(dst7 + pos) = u7;
+
+        s0 = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(u0, w));
+        s1 = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(u1, w));
+        s2 = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(u2, w));
+        s3 = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(u3, w));
+        s4 = hvx_vec_add_f32_f32(s4, hvx_vec_mul_f32_f32(u4, w));
+        s5 = hvx_vec_add_f32_f32(s5, hvx_vec_mul_f32_f32(u5, w));
+        s6 = hvx_vec_add_f32_f32(s6, hvx_vec_mul_f32_f32(u6, w));
+        s7 = hvx_vec_add_f32_f32(s7, hvx_vec_mul_f32_f32(u7, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+        const HVX_Vector g = hvx_vmem(mul + body);
+        const HVX_Vector w = hvx_vmem(dot + body);
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     none  = Q6_V_vzero();
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + body), g);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + body), g);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + body), g);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + body), g);
+        const HVX_Vector u4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + body), g);
+        const HVX_Vector u5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + body), g);
+        const HVX_Vector u6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + body), g);
+        const HVX_Vector u7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + body), g);
+
+        // Write back only the valid remainder bytes of each row.
+        hvx_vec_store_u(dst0 + body, rem_bytes, u0);
+        hvx_vec_store_u(dst1 + body, rem_bytes, u1);
+        hvx_vec_store_u(dst2 + body, rem_bytes, u2);
+        hvx_vec_store_u(dst3 + body, rem_bytes, u3);
+        hvx_vec_store_u(dst4 + body, rem_bytes, u4);
+        hvx_vec_store_u(dst5 + body, rem_bytes, u5);
+        hvx_vec_store_u(dst6 + body, rem_bytes, u6);
+        hvx_vec_store_u(dst7 + body, rem_bytes, u7);
+
+        // Lanes past the remainder are replaced with zero before accumulating.
+        s0 = hvx_vec_add_f32_f32(s0, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u0, w), none));
+        s1 = hvx_vec_add_f32_f32(s1, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u1, w), none));
+        s2 = hvx_vec_add_f32_f32(s2, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u2, w), none));
+        s3 = hvx_vec_add_f32_f32(s3, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u3, w), none));
+        s4 = hvx_vec_add_f32_f32(s4, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u4, w), none));
+        s5 = hvx_vec_add_f32_f32(s5, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u5, w), none));
+        s6 = hvx_vec_add_f32_f32(s6, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u6, w), none));
+        s7 = hvx_vec_add_f32_f32(s7, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u7, w), none));
+    }
+
+    HVX_Vector_x4 lower = { .v = { s0, s1, s2, s3 } };
+    HVX_Vector_x4 upper = { .v = { s4, s5, s6, s7 } };
+    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(lower));
+    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(upper));
+}
+
+// Eight-row scalar-gate-and-dot step; all rows share `mul` and `dot`.
+//
+// For each row r in 0..7: dst<r>[i] *= mul for i in [0, n), and the dot
+// product of the updated row with `dot` is produced for that row.
+//
+//   dst0..dst7 : rows updated in place (hvx_vmemu loads/stores)
+//   mul        : scalar multiplier, broadcast to every lane
+//   dot        : shared input (hvx_vmem loads)
+//   n          : f32 elements per row; a remainder below one vector is handled
+//   sums       : receives 8 floats, written as two 16-byte stores at sums + 0
+//                (rows 0..3) and sums + 4 (rows 4..7), each taken from a
+//                hvx_vec_reduce_sum_f32x4 result
+//
+// NOTE(review): dot goes through hvx_vmem, which presumably requires a
+// vector-aligned pointer - confirm against hvx-utils.h. The remainder path
+// issues whole-vector loads; only stores and accumulation are lane-limited.
+static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float * restrict dst4,
+        float * restrict dst5, float * restrict dst6, float * restrict dst7,
+        float mul, const float * restrict dot, uint32_t n, float * restrict sums) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+    const HVX_Vector factor = hvx_vec_splat_f32(mul);
+
+    HVX_Vector s0 = Q6_V_vzero();
+    HVX_Vector s1 = Q6_V_vzero();
+    HVX_Vector s2 = Q6_V_vzero();
+    HVX_Vector s3 = Q6_V_vzero();
+    HVX_Vector s4 = Q6_V_vzero();
+    HVX_Vector s5 = Q6_V_vzero();
+    HVX_Vector s6 = Q6_V_vzero();
+    HVX_Vector s7 = Q6_V_vzero();
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector w = hvx_vmem(dot + pos);
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + pos), factor);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + pos), factor);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + pos), factor);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + pos), factor);
+        const HVX_Vector u4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + pos), factor);
+        const HVX_Vector u5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + pos), factor);
+        const HVX_Vector u6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + pos), factor);
+        const HVX_Vector u7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + pos), factor);
+
+        hvx_vmemu(dst0 + pos) = u0;
+        hvx_vmemu(dst1 + pos) = u1;
+        hvx_vmemu(dst2 + pos) = u2;
+        hvx_vmemu(dst3 + pos) = u3;
+        hvx_vmemu(dst4 + pos) = u4;
+        hvx_vmemu(dst5 + pos) = u5;
+        hvx_vmemu(dst6 + pos) = u6;
+        hvx_vmemu(dst7 + pos) = u7;
+
+        s0 = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(u0, w));
+        s1 = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(u1, w));
+        s2 = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(u2, w));
+        s3 = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(u3, w));
+        s4 = hvx_vec_add_f32_f32(s4, hvx_vec_mul_f32_f32(u4, w));
+        s5 = hvx_vec_add_f32_f32(s5, hvx_vec_mul_f32_f32(u5, w));
+        s6 = hvx_vec_add_f32_f32(s6, hvx_vec_mul_f32_f32(u6, w));
+        s7 = hvx_vec_add_f32_f32(s7, hvx_vec_mul_f32_f32(u7, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+        const HVX_Vector w = hvx_vmem(dot + body);
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     none  = Q6_V_vzero();
+
+        const HVX_Vector u0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + body), factor);
+        const HVX_Vector u1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + body), factor);
+        const HVX_Vector u2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + body), factor);
+        const HVX_Vector u3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + body), factor);
+        const HVX_Vector u4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + body), factor);
+        const HVX_Vector u5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + body), factor);
+        const HVX_Vector u6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + body), factor);
+        const HVX_Vector u7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + body), factor);
+
+        // Write back only the valid remainder bytes of each row.
+        hvx_vec_store_u(dst0 + body, rem_bytes, u0);
+        hvx_vec_store_u(dst1 + body, rem_bytes, u1);
+        hvx_vec_store_u(dst2 + body, rem_bytes, u2);
+        hvx_vec_store_u(dst3 + body, rem_bytes, u3);
+        hvx_vec_store_u(dst4 + body, rem_bytes, u4);
+        hvx_vec_store_u(dst5 + body, rem_bytes, u5);
+        hvx_vec_store_u(dst6 + body, rem_bytes, u6);
+        hvx_vec_store_u(dst7 + body, rem_bytes, u7);
+
+        // Lanes past the remainder are replaced with zero before accumulating.
+        s0 = hvx_vec_add_f32_f32(s0, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u0, w), none));
+        s1 = hvx_vec_add_f32_f32(s1, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u1, w), none));
+        s2 = hvx_vec_add_f32_f32(s2, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u2, w), none));
+        s3 = hvx_vec_add_f32_f32(s3, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u3, w), none));
+        s4 = hvx_vec_add_f32_f32(s4, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u4, w), none));
+        s5 = hvx_vec_add_f32_f32(s5, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u5, w), none));
+        s6 = hvx_vec_add_f32_f32(s6, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u6, w), none));
+        s7 = hvx_vec_add_f32_f32(s7, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u7, w), none));
+    }
+
+    HVX_Vector_x4 lower = { .v = { s0, s1, s2, s3 } };
+    HVX_Vector_x4 upper = { .v = { s4, s5, s6, s7 } };
+    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(lower));
+    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(upper));
+}
+
+// Eight-row scaled-accumulate-and-dot step; rows share `src` and `dot` but
+// each row has its own scale factor.
+//
+// For each row r in 0..7: dst<r>[i] += src[i] * scale[r] for i in [0, n), and
+// the dot product of the updated row with `dot` is produced for that row.
+//
+//   dst0..dst7 : rows updated in place (hvx_vmemu loads/stores)
+//   src        : shared vector added into every row (hvx_vmem loads)
+//   scale      : 8 per-row scalars; scale[0..7] are read
+//   dot        : shared input (hvx_vmem loads)
+//   n          : f32 elements per row; a remainder below one vector is handled
+//   sums       : receives 8 floats, written as two 16-byte stores at sums + 0
+//                (rows 0..3) and sums + 4 (rows 4..7), each taken from a
+//                hvx_vec_reduce_sum_f32x4 result
+//
+// NOTE(review): src and dot go through hvx_vmem, which presumably requires
+// vector-aligned pointers - confirm against hvx-utils.h. The remainder path
+// issues whole-vector loads; only stores and accumulation are lane-limited.
+static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float * restrict dst4,
+        float * restrict dst5, float * restrict dst6, float * restrict dst7,
+        const float * restrict src, const float * restrict scale,
+        const float * restrict dot, uint32_t n, float * restrict sums) {
+    const uint32_t lanes = 128 / sizeof(float);
+    const uint32_t rem   = n % lanes;
+    const uint32_t body  = n - rem;  // elements covered by whole-vector steps
+
+    // Broadcast each row's scalar once, outside the loops.
+    const HVX_Vector f0 = hvx_vec_splat_f32(scale[0]);
+    const HVX_Vector f1 = hvx_vec_splat_f32(scale[1]);
+    const HVX_Vector f2 = hvx_vec_splat_f32(scale[2]);
+    const HVX_Vector f3 = hvx_vec_splat_f32(scale[3]);
+    const HVX_Vector f4 = hvx_vec_splat_f32(scale[4]);
+    const HVX_Vector f5 = hvx_vec_splat_f32(scale[5]);
+    const HVX_Vector f6 = hvx_vec_splat_f32(scale[6]);
+    const HVX_Vector f7 = hvx_vec_splat_f32(scale[7]);
+
+    HVX_Vector s0 = Q6_V_vzero();
+    HVX_Vector s1 = Q6_V_vzero();
+    HVX_Vector s2 = Q6_V_vzero();
+    HVX_Vector s3 = Q6_V_vzero();
+    HVX_Vector s4 = Q6_V_vzero();
+    HVX_Vector s5 = Q6_V_vzero();
+    HVX_Vector s6 = Q6_V_vzero();
+    HVX_Vector s7 = Q6_V_vzero();
+
+    for (uint32_t pos = 0; pos < body; pos += lanes) {
+        const HVX_Vector add = hvx_vmem(src + pos);
+        const HVX_Vector w   = hvx_vmem(dot + pos);
+
+        const HVX_Vector u0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + pos), hvx_vec_mul_f32_f32(add, f0));
+        const HVX_Vector u1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + pos), hvx_vec_mul_f32_f32(add, f1));
+        const HVX_Vector u2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + pos), hvx_vec_mul_f32_f32(add, f2));
+        const HVX_Vector u3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + pos), hvx_vec_mul_f32_f32(add, f3));
+        const HVX_Vector u4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + pos), hvx_vec_mul_f32_f32(add, f4));
+        const HVX_Vector u5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + pos), hvx_vec_mul_f32_f32(add, f5));
+        const HVX_Vector u6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + pos), hvx_vec_mul_f32_f32(add, f6));
+        const HVX_Vector u7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + pos), hvx_vec_mul_f32_f32(add, f7));
+
+        hvx_vmemu(dst0 + pos) = u0;
+        hvx_vmemu(dst1 + pos) = u1;
+        hvx_vmemu(dst2 + pos) = u2;
+        hvx_vmemu(dst3 + pos) = u3;
+        hvx_vmemu(dst4 + pos) = u4;
+        hvx_vmemu(dst5 + pos) = u5;
+        hvx_vmemu(dst6 + pos) = u6;
+        hvx_vmemu(dst7 + pos) = u7;
+
+        s0 = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(u0, w));
+        s1 = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(u1, w));
+        s2 = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(u2, w));
+        s3 = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(u3, w));
+        s4 = hvx_vec_add_f32_f32(s4, hvx_vec_mul_f32_f32(u4, w));
+        s5 = hvx_vec_add_f32_f32(s5, hvx_vec_mul_f32_f32(u5, w));
+        s6 = hvx_vec_add_f32_f32(s6, hvx_vec_mul_f32_f32(u6, w));
+        s7 = hvx_vec_add_f32_f32(s7, hvx_vec_mul_f32_f32(u7, w));
+    }
+
+    if (rem != 0) {
+        const uint32_t rem_bytes = rem * sizeof(float);
+        const HVX_Vector add = hvx_vmem(src + body);
+        const HVX_Vector w   = hvx_vmem(dot + body);
+        const HVX_VectorPred valid = Q6_Q_vsetq2_R(rem_bytes);
+        const HVX_Vector     none  = Q6_V_vzero();
+
+        const HVX_Vector u0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + body), hvx_vec_mul_f32_f32(add, f0));
+        const HVX_Vector u1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + body), hvx_vec_mul_f32_f32(add, f1));
+        const HVX_Vector u2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + body), hvx_vec_mul_f32_f32(add, f2));
+        const HVX_Vector u3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + body), hvx_vec_mul_f32_f32(add, f3));
+        const HVX_Vector u4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + body), hvx_vec_mul_f32_f32(add, f4));
+        const HVX_Vector u5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + body), hvx_vec_mul_f32_f32(add, f5));
+        const HVX_Vector u6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + body), hvx_vec_mul_f32_f32(add, f6));
+        const HVX_Vector u7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + body), hvx_vec_mul_f32_f32(add, f7));
+
+        // Write back only the valid remainder bytes of each row.
+        hvx_vec_store_u(dst0 + body, rem_bytes, u0);
+        hvx_vec_store_u(dst1 + body, rem_bytes, u1);
+        hvx_vec_store_u(dst2 + body, rem_bytes, u2);
+        hvx_vec_store_u(dst3 + body, rem_bytes, u3);
+        hvx_vec_store_u(dst4 + body, rem_bytes, u4);
+        hvx_vec_store_u(dst5 + body, rem_bytes, u5);
+        hvx_vec_store_u(dst6 + body, rem_bytes, u6);
+        hvx_vec_store_u(dst7 + body, rem_bytes, u7);
+
+        // Lanes past the remainder are replaced with zero before accumulating.
+        s0 = hvx_vec_add_f32_f32(s0, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u0, w), none));
+        s1 = hvx_vec_add_f32_f32(s1, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u1, w), none));
+        s2 = hvx_vec_add_f32_f32(s2, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u2, w), none));
+        s3 = hvx_vec_add_f32_f32(s3, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u3, w), none));
+        s4 = hvx_vec_add_f32_f32(s4, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u4, w), none));
+        s5 = hvx_vec_add_f32_f32(s5, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u5, w), none));
+        s6 = hvx_vec_add_f32_f32(s6, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u6, w), none));
+        s7 = hvx_vec_add_f32_f32(s7, Q6_V_vmux_QVV(valid, hvx_vec_mul_f32_f32(u7, w), none));
+    }
+
+    HVX_Vector_x4 lower = { .v = { s0, s1, s2, s3 } };
+    HVX_Vector_x4 upper = { .v = { s4, s5, s6, s7 } };
+    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(lower));
+    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(upper));
+}
+
+static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
+    struct htp_ops_context * octx = gctx->octx;
+
+    const struct htp_tensor * q     = octx->src[0];
+    const struct htp_tensor * k     = octx->src[1];
+    const struct htp_tensor * v     = octx->src[2];
+    const struct htp_tensor * g     = octx->src[3];
+    const struct htp_tensor * beta  = octx->src[4];
+    const struct htp_tensor * state = octx->src[5];
+    const struct htp_tensor * dst   = octx->dst;
+
+    const uint32_t S_v      = v->ne[0];
+    const uint32_t H        = v->ne[1];
+    const uint32_t n_tokens = v->ne[2];
+    const uint32_t n_seqs   = v->ne[3];
+
+    const uint32_t total_rows = H * n_seqs;
+    if (ith >= total_rows) {
+        return;
+    }
+
+    const uint32_t rq3 = n_seqs / q->ne[3];
+    const uint32_t rk3 = n_seqs / k->ne[3];
+    const float scale = 1.0f / sqrtf((float) S_v);
+
+    float * dst_base       = (float *) (uintptr_t) dst->data;
+    float * state_out_base = dst_base + (uint64_t) S_v * H * n_tokens * n_seqs;
+    const float * state_in_base = (const float *) (uintptr_t) state->data;
+
+    const bool kda = (g->ne[0] == S_v);
+    float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_sums[4] __attribute__((aligned(128)));
+
+    for (uint32_t ir = ith; ir < total_rows; ir += nth) {
+        const uint32_t iv1 = ir % H;
+        const uint32_t iv3 = ir / H;
+
+        const uint32_t iq1 = iv1 % q->ne[1];
+        const uint32_t ik1 = iv1 % k->ne[1];
+        const uint32_t iq3 = iv3 / rq3;
+        const uint32_t ik3 = iv3 / rk3;
+
+        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+
+        memcpy(s_out, s_in, gctx->state_bytes);
+        float * s_work = s_out;
+
+        float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;
+
+        for (uint32_t t = 0; t < n_tokens; ++t) {
+            const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
+                    (uint64_t) iq3 * q->nb[3] + (uint64_t) t * q->nb[2] + (uint64_t) iq1 * q->nb[1]);
+            const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
+                    (uint64_t) ik3 * k->nb[3] + (uint64_t) t * k->nb[2] + (uint64_t) ik1 * k->nb[1]);
+            const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
+                    (uint64_t) iv3 * v->nb[3] + (uint64_t) t * v->nb[2] + (uint64_t) iv1 * v->nb[1]);
+            const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
+                    (uint64_t) iv3 * g->nb[3] + (uint64_t) t * g->nb[2] + (uint64_t) iv1 * g->nb[1]);
+            const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
+                    (uint64_t) iv3 * beta->nb[3] + (uint64_t) t * beta->nb[2] + (uint64_t) iv1 * beta->nb[1]);
+
+            memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
+            memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
+
+            if (kda) {
+                hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
+
+                uint32_t j = 0;
+                for (; j + 4 <= S_v; j += 4) {
+                    float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                    float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                    float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                    float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                    gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
+                    float local_delta_b[4] __attribute__((aligned(128)));
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                    }
+                    gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        attn_data[j + r] = local_sums[r] * scale;
+                    }
+                }
+                for (; j < S_v; ++j) {
+                    float * row = s_work + (uint64_t) j * S_v;
+                    const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
+                    const float dj = (v_t[j] - sum) * beta_val;
+                    attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+                }
+            } else {
+                const float gate = expf(g_t[0]);
+                uint32_t j = 0;
+                for (; j + 4 <= S_v; j += 4) {
+                    float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                    float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                    float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                    float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                    gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
+                    float local_delta_b[4] __attribute__((aligned(128)));
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                    }
+                    gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        attn_data[j + r] = local_sums[r] * scale;
+                    }
+                }
+                for (; j < S_v; ++j) {
+                    float * row = s_work + (uint64_t) j * S_v;
+                    const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
+                    const float dj = (v_t[j] - sum) * beta_val;
+                    attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+                }
+            }
+
+            attn_data += (uint64_t) S_v * H;
+        }
+    }
+}
+
+static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, void * data) {  // Worker for the single-token path (op_gated_delta_net dispatches it when n_tokens == 1); ith = this worker's index, nth = worker count, data = struct htp_gdn_context *.
+    struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
+    struct htp_ops_context * octx = gctx->octx;
+
+    const struct htp_tensor * q     = octx->src[0];
+    const struct htp_tensor * k     = octx->src[1];
+    const struct htp_tensor * v     = octx->src[2];
+    const struct htp_tensor * g     = octx->src[3];
+    const struct htp_tensor * beta  = octx->src[4];
+    const struct htp_tensor * state = octx->src[5];
+    const struct htp_tensor * dst   = octx->dst;
+
+    const uint32_t S_v      = v->ne[0];  // row length of v; the per-head state is addressed as S_v x S_v floats below
+    const uint32_t H        = v->ne[1];  // v->ne[1], used as the head count
+    const uint32_t n_seqs   = v->ne[3];  // v->ne[3], used as the sequence count (v->ne[2] is not read on this path)
+
+    const uint32_t total_rows = H * n_seqs;  // one work item per (head, sequence) pair
+    if (ith >= total_rows) {  // more workers than work items: nothing to do for this one
+        return;
+    }
+
+    const uint32_t rq3 = n_seqs / q->ne[3];  // how many sequences share one q slice along dim 3
+    const uint32_t rk3 = n_seqs / k->ne[3];  // same ratio for k
+    const float scale = 1.0f / sqrtf((float) S_v);  // applied to every attention output element
+
+    float * dst_base       = (float *) (uintptr_t) dst->data;
+    float * state_out_base = dst_base + (uint64_t) S_v * H * n_seqs;  // updated state is stored after the S_v*H*n_seqs attention outputs
+    const float * state_in_base = (const float *) (uintptr_t) state->data;
+
+    const bool kda = (g->ne[0] == S_v);  // true: one gate value per element of a row; false: only g_t[0] is used as a scalar gate
+    float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));  // exp(g) per element, filled only when kda
+    float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));  // 128-byte aligned stack copy of the q row
+    float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));  // 128-byte aligned stack copy of the k row
+    float local_sums[8] __attribute__((aligned(128)));  // per-row results returned by the 8-row / 4-row helpers
+
+    dma_queue * dma = octx->ctx->dma[ith];  // DMA queue selected by worker index
+
+    uint8_t * spad = NULL;  // stays NULL when VTCM staging is disabled
+    if (gctx->use_vtcm) {
+        spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith;  // this worker's private slice of the VTCM region
+    }
+
+    for (uint32_t ir = ith; ir < total_rows; ir += nth) {  // work items are interleaved across workers with stride nth
+        const uint32_t iv1 = ir % H;  // head index
+        const uint32_t iv3 = ir / H;  // sequence index
+
+        const uint32_t iq1 = iv1 % q->ne[1];  // q/k may have fewer heads than v; wrap the head index
+        const uint32_t ik1 = iv1 % k->ne[1];
+        const uint32_t iq3 = iv3 / rq3;  // map the sequence index onto q's (possibly smaller) dim 3
+        const uint32_t ik3 = iv3 / rk3;
+
+        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;  // destination of this pair's updated state
+        const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;  // source state for this pair
+        float * s_work;  // buffer the state is updated in: spad when staging, otherwise s_out itself
+
+        if (spad) {
+            dma_queue_push(dma, dma_make_ptr(spad, s_in),  // stage the input state into spad; dma_queue_push is declared elsewhere
+                           S_v * sizeof(float), S_v * sizeof(float),  // presumably destination and source row strides in bytes — TODO confirm argument order
+                           S_v * sizeof(float), S_v);  // presumably bytes per row and number of rows — TODO confirm
+            dma_queue_pop(dma);  // popped before s_work is read; presumably waits for the transfer to finish — TODO confirm
+            s_work = (float *) spad;
+        } else {
+            s_work = s_out;  // no staging: copy the state to the output and update it in place
+            memcpy(s_work, s_in, gctx->state_bytes);
+        }
+
+        float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;  // S_v attention outputs for this pair
+
+        const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +  // byte-stride addressing; no nb[2] term is applied on this path
+                (uint64_t) iq3 * q->nb[3] + (uint64_t) iq1 * q->nb[1]);
+        const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
+                (uint64_t) ik3 * k->nb[3] + (uint64_t) ik1 * k->nb[1]);
+        const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
+                (uint64_t) iv3 * v->nb[3] + (uint64_t) iv1 * v->nb[1]);
+        const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
+                (uint64_t) iv3 * g->nb[3] + (uint64_t) iv1 * g->nb[1]);
+        const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +  // a single beta value per (head, sequence)
+                (uint64_t) iv3 * beta->nb[3] + (uint64_t) iv1 * beta->nb[1]);
+
+        memcpy(local_q, q_t, (size_t) S_v * sizeof(float));  // copy into the aligned locals handed to the helpers below
+        memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
+
+        if (kda) {
+            hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);  // local_gate = exp(g_t) over S_v elements; meaning of the final flag is not visible here
+
+            uint32_t j = 0;  // index of the state row / output element being processed
+            for (; j + 8 <= S_v; j += 8) {  // main loop: eight state rows per iteration
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                float * row4 = s_work + (uint64_t) (j + 4) * S_v;
+                float * row5 = s_work + (uint64_t) (j + 5) * S_v;
+                float * row6 = s_work + (uint64_t) (j + 6) * S_v;
+                float * row7 = s_work + (uint64_t) (j + 7) * S_v;
+                gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,  // defined elsewhere; presumably gates each row in place and returns its dot product with local_k — TODO confirm
+                                 local_gate, local_k, S_v, local_sums);
+                float local_delta_b[8] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 8; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;  // per-row delta: (v - sum) * beta
+                }
+                gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,  // defined elsewhere; presumably adds local_k * delta to each row and returns its dot product with local_q — TODO confirm
+                                        local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 8; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;  // scaled result becomes output element j + r
+                }
+            }
+            for (; j + 4 <= S_v; j += 4) {  // same sequence for a remaining group of four rows
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
+                float local_delta_b[4] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 4; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                }
+                gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 4; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;
+                }
+            }
+            for (; j < S_v; ++j) {  // leftover rows, one at a time
+                float * row = s_work + (uint64_t) j * S_v;
+                const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
+                const float dj = (v_t[j] - sum) * beta_val;
+                attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+            }
+        } else {
+            const float gate = expf(g_t[0]);  // scalar gate: only the first element of g_t is read
+            uint32_t j = 0;
+            for (; j + 8 <= S_v; j += 8) {  // same 8 / 4 / 1 row structure as the kda branch, using the *_scalar_* helpers
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                float * row4 = s_work + (uint64_t) (j + 4) * S_v;
+                float * row5 = s_work + (uint64_t) (j + 5) * S_v;
+                float * row6 = s_work + (uint64_t) (j + 6) * S_v;
+                float * row7 = s_work + (uint64_t) (j + 7) * S_v;
+                gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
+                                        gate, local_k, S_v, local_sums);
+                float local_delta_b[8] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 8; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                }
+                gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
+                                        local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 8; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;
+                }
+            }
+            for (; j + 4 <= S_v; j += 4) {
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
+                float local_delta_b[4] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 4; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                }
+                gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 4; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;
+                }
+            }
+            for (; j < S_v; ++j) {
+                float * row = s_work + (uint64_t) j * S_v;
+                const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
+                const float dj = (v_t[j] - sum) * beta_val;
+                attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+            }
+        }
+
+        if (spad) {
+            dma_queue_push(dma, dma_make_ptr(s_out, spad),  // write the updated state from spad back to the output tensor
+                           S_v * sizeof(float), S_v * sizeof(float),
+                           S_v * sizeof(float), S_v);
+            dma_queue_pop(dma);  // popped before spad is reused by the next work item
+        }
+    }
+}
+
+int op_gated_delta_net(struct htp_ops_context * octx) {  // Entry point for the gated-delta-net op: validates the six inputs and dst, then runs a worker function on the pool. Returns an HTP_STATUS_* code.
+    const struct htp_tensor * q     = octx->src[0];
+    const struct htp_tensor * k     = octx->src[1];
+    const struct htp_tensor * v     = octx->src[2];
+    const struct htp_tensor * g     = octx->src[3];
+    const struct htp_tensor * beta  = octx->src[4];
+    const struct htp_tensor * state = octx->src[5];
+    const struct htp_tensor * dst   = octx->dst;
+
+    if (!q || !k || !v || !g || !beta || !state || !dst) {  // every operand is required
+        return HTP_STATUS_INVAL_PARAMS;
+    }
+
+    if (q->type != HTP_TYPE_F32 || k->type != HTP_TYPE_F32 || v->type != HTP_TYPE_F32 ||  // only all-F32 operands are handled
+        g->type != HTP_TYPE_F32 || beta->type != HTP_TYPE_F32 || state->type != HTP_TYPE_F32 ||
+        dst->type != HTP_TYPE_F32) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    const uint32_t S_v      = v->ne[0];  // all shape checks below are expressed relative to v's dimensions
+    const uint32_t H        = v->ne[1];
+    const uint32_t n_tokens = v->ne[2];
+    const uint32_t n_seqs   = v->ne[3];
+
+    if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {  // the workers size their stack buffers with HTP_GDN_MAX_SV, so S_v must not exceed it
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {  // gate is either scalar or per-element; beta must be scalar
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] == 0 || k->ne[1] == 0 ||  // q/k must match v in row length and token count
+        q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] == 0 || k->ne[3] == 0 ||
+        (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {  // the workers divide n_seqs by q->ne[3] and k->ne[3], so both must divide it evenly
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {  // state must hold S_v*S_v floats per (head, sequence); NOTE(review): products use the operands' integer width — confirm they cannot wrap
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {  // dst holds the attention rows followed by the updated state rows
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {  // validation only: report success without running the kernel
+        return HTP_STATUS_OK;
+    }
+
+    struct htp_gdn_context gctx;  // shared, read-only parameters handed to every worker
+    gctx.octx = octx;
+    gctx.rows_per_thread = (H * n_seqs + octx->n_threads - 1) / octx->n_threads;  // ceil(work items / threads); NOTE(review): the tg worker strides by nth and does not read this field
+    gctx.state_bytes = (size_t) S_v * S_v * sizeof(float);  // size of one (head, sequence) state
+
+    size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
+    state_aligned = (state_aligned + 127) & ~(size_t)127;  // round up to a multiple of 128 bytes
+
+    gctx.use_vtcm = false;  // default: workers update the state directly in dst
+    gctx.vtcm_state_base = NULL;
+    gctx.vtcm_state_per_thread = 0;
+
+    if (n_tokens == 1 && octx->ctx->vtcm_base) {  // VTCM staging is only set up for the single-token path
+        size_t vtcm_total = state_aligned * octx->n_threads;  // one aligned state slot per worker
+        if (octx->ctx->vtcm_size >= vtcm_total) {  // enable staging only when every worker's slot fits
+            gctx.use_vtcm = true;
+            gctx.vtcm_state_base = octx->ctx->vtcm_base;
+            gctx.vtcm_state_per_thread = state_aligned;
+        }
+    }
+
+    if (n_tokens == 1) {  // single token: tg worker; otherwise: pp worker
+        worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_tg_thread, &gctx, octx->n_threads);
+    } else {
+        worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_pp_thread, &gctx, octx->n_threads);
+    }
+
+    return HTP_STATUS_OK;  // gctx lives on this stack frame; presumably worker_pool_run_func returns only after all workers finish — TODO confirm
+}
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -106,5 +106,6 @@ int op_cumsum(struct htp_ops_context * octx);
 int op_fill(struct htp_ops_context * octx);
 int op_diag(struct htp_ops_context * octx);
 int op_solve_tri(struct htp_ops_context * octx);
+int op_gated_delta_net(struct htp_ops_context * octx);

 #endif /* HTP_CTX_H */
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -83,6 +83,9 @@ enum htp_op_code {
    HTP_OP_FILL,
    HTP_OP_DIAG,
    HTP_OP_SOLVE_TRI,
+    HTP_OP_L2_NORM,
+    HTP_OP_GATED_DELTA_NET,
+
    HTP_OP_INVALID
 };

--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -542,6 +542,7 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_UNARY_SIGMOID:
        case HTP_OP_UNARY_NEG:
        case HTP_OP_UNARY_EXP:
+        case HTP_OP_L2_NORM:
            return op_unary(octx);

        case HTP_OP_UNARY_SILU:
@@ -593,6 +594,9 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_SOLVE_TRI:
            return op_solve_tri(octx);

+        case HTP_OP_GATED_DELTA_NET:
+            return op_gated_delta_net(octx);
+
        case HTP_OP_INVALID:
            break;

--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -298,6 +298,81 @@ static void softplus_f32(const float * restrict src,
    }
 }

+// --- L2_NORM HVX kernel ---
+// Computes y[i] = x[i] / fmax(sqrt(sum_j x[j]^2), epsilon) for one row of num_elems floats.
+// The factor scale = 1/fmax(sqrt(sum), epsilon) is built entirely in HVX registers
+// (rsqrt followed by inverse) so the reduced sum is never extracted to a scalar.
+static void hvx_fast_l2_norm_f32(const uint8_t * restrict src,
+                                 uint8_t * restrict dst,
+                                 uint8_t * restrict pad,
+                                 const int num_elems,
+                                 float     epsilon) {
+    (void)pad;  // pad is accepted but never used by this kernel
+
+    const HVX_Vector * restrict v_src = (HVX_Vector *) src;  // src/dst are accessed as whole HVX vectors
+    HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
+
+    HVX_Vector sum_v = hvx_vec_splat_f32(0.0f);  // running sum of squares, accumulated with the qf32 add below
+
+    const int nvec = num_elems / VLEN_FP32;  // number of full vectors in the row
+    const int nloe = num_elems % VLEN_FP32;  // leftover elements after the last full vector
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
+        sum_v         = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
+    }
+
+    // Add the tail elements to the sum of squares, masking the lanes past the end of the row
+    if (nloe > 0) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);  // predicate built from the tail length in bytes
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);  // NOTE(review): loads a whole vector at v_src[nvec]; assumes the row buffer extends to a full vector — confirm against the caller
+        HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
+        sum_v         = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
+    }
+
+    // Compute scale = 1/fmax(sqrt(sum), epsilon) without leaving HVX registers.
+    // NOTE(review): for an all-zero row sum is 0; confirm how hvx_vec_rsqrt_f32 / hvx_vec_inverse_f32 treat 0 and inf.
+    HVX_Vector sum_sf    = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
+    HVX_Vector rsqrt_v   = hvx_vec_rsqrt_f32(sum_sf);              // 1/sqrt(sum)
+    HVX_Vector sqrt_v    = hvx_vec_inverse_f32(rsqrt_v);            // sqrt(sum)
+    HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
+    HVX_Vector denom_v   = Q6_Vsf_vmax_VsfVsf(sqrt_v, epsilon_v);  // fmax(sqrt(sum), epsilon)
+    HVX_Vector scale_v   = hvx_vec_inverse_f32(denom_v);            // 1/fmax(sqrt(sum), epsilon)
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        v_dst[i]      = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));  // y = x * scale, converted from qf32 back to sf
+    }
+
+    if (nloe > 0) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
+        HVX_Vector result = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));
+        hvx_vec_store_a(&v_dst[nvec], nloe * 4, result);  // store call is given the tail size in bytes (nloe * 4); presumably writes only that many — TODO confirm
+    }
+}
+
+static void l2_norm_f32(const float * restrict src,  // Applies hvx_fast_l2_norm_f32 to each of num_rows rows; src and dst rows are row_size bytes apart.
+                        float * restrict dst,
+                        uint8_t * restrict spad,  // forwarded to the kernel, which ignores it
+                        const uint32_t num_rows,
+                        const uint32_t row_elems,  // number of floats normalized per row
+                        const size_t   row_size,  // byte stride between consecutive rows, used for both src and dst
+                        int32_t *      op_params) {
+    float epsilon = 0.f;
+    memcpy(&epsilon, op_params, sizeof(float));  // epsilon is stored as raw float bits at the start of op_params
+
+    for (uint32_t ir = 0; ir < num_rows; ir++) {
+        const float * restrict src_f = (const float *)((const uint8_t *)src + (ir * row_size));  // byte arithmetic: row_size is a stride in bytes, not elements
+        float * restrict dst_f       = (float *)((uint8_t *)dst + (ir * row_size));
+
+        hvx_fast_l2_norm_f32((const uint8_t *)src_f, (uint8_t *)dst_f, spad, row_elems, epsilon);
+    }
+}
+
 static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
    const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
    struct htp_ops_context * octx = uctx->octx;
@@ -402,6 +477,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
            case HTP_OP_UNARY_SOFTPLUS:
                softplus_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                break;
+            case HTP_OP_L2_NORM:
+                l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
+                break;
            default:
                break;
        }
@@ -469,6 +547,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
        case HTP_OP_UNARY_SOFTPLUS:
            op_type = "softplus-f32";
            break;
+        case HTP_OP_L2_NORM:
+            op_type = "l2norm-f32";
+            break;

        default:
            FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -87,17 +87,17 @@ static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer,
 }

 static ggml_backend_buffer_i ggml_backend_metal_buffer_shared_i = {
-    /* .free_buffer     = */ ggml_backend_metal_buffer_shared_free_buffer,
-    /* .get_base        = */ ggml_backend_metal_buffer_shared_get_base,
-    /* .init_tensor     = */ NULL,
-    /* .memset_tensor   = */ ggml_backend_metal_buffer_shared_memset_tensor,
-    /* .set_tensor      = */ ggml_backend_metal_buffer_shared_set_tensor,
-    /* .get_tensor      = */ ggml_backend_metal_buffer_shared_get_tensor,
-    /* .set_tensor_2d   = */ NULL,
-    /* .get_tensor_2d   = */ NULL,
-    /* .cpy_tensor      = */ ggml_backend_metal_buffer_shared_cpy_tensor,
-    /* .clear           = */ ggml_backend_metal_buffer_shared_clear,
-    /* .reset           = */ NULL,
+    /* .free_buffer   = */ ggml_backend_metal_buffer_shared_free_buffer,
+    /* .get_base      = */ ggml_backend_metal_buffer_shared_get_base,
+    /* .init_tensor   = */ NULL,
+    /* .memset_tensor = */ ggml_backend_metal_buffer_shared_memset_tensor,
+    /* .set_tensor    = */ ggml_backend_metal_buffer_shared_set_tensor,
+    /* .get_tensor    = */ ggml_backend_metal_buffer_shared_get_tensor,
+    /* .set_tensor_2d = */ NULL,
+    /* .get_tensor_2d = */ NULL,
+    /* .cpy_tensor    = */ ggml_backend_metal_buffer_shared_cpy_tensor,
+    /* .clear         = */ ggml_backend_metal_buffer_shared_clear,
+    /* .reset         = */ NULL,
 };

 // private buffer
@@ -163,17 +163,17 @@ static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer
 }

 static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
-    /* .free_buffer             = */ ggml_backend_metal_buffer_private_free_buffer,
-    /* .get_base                = */ ggml_backend_metal_buffer_private_get_base,
-    /* .init_tensor             = */ NULL,
-    /* .memset_tensor           = */ ggml_backend_metal_buffer_private_memset_tensor,
-    /* .set_tensor              = */ ggml_backend_metal_buffer_private_set_tensor,
-    /* .get_tensor              = */ ggml_backend_metal_buffer_private_get_tensor,
-    /* .set_tensor_2d           = */ NULL,
-    /* .get_tensor_2d           = */ NULL,
-    /* .cpy_tensor              = */ ggml_backend_metal_buffer_private_cpy_tensor,
-    /* .clear                   = */ ggml_backend_metal_buffer_private_clear,
-    /* .reset                   = */ NULL,
+    /* .free_buffer   = */ ggml_backend_metal_buffer_private_free_buffer,
+    /* .get_base      = */ ggml_backend_metal_buffer_private_get_base,
+    /* .init_tensor   = */ NULL,
+    /* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
+    /* .set_tensor    = */ ggml_backend_metal_buffer_private_set_tensor,
+    /* .get_tensor    = */ ggml_backend_metal_buffer_private_get_tensor,
+    /* .set_tensor_2d = */ NULL,
+    /* .get_tensor_2d = */ NULL,
+    /* .cpy_tensor    = */ ggml_backend_metal_buffer_private_cpy_tensor,
+    /* .clear         = */ ggml_backend_metal_buffer_private_clear,
+    /* .reset         = */ NULL,
 };

 static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) {
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -102,6 +102,8 @@ set(GGML_OPENCL_KERNELS
    mul_mv_id_q8_0_f32_flat
    mul_mv_id_mxfp4_f32
    mul_mv_id_mxfp4_f32_flat
+    gemm_moe_q4_0_f32_ns
+    gemv_moe_q4_0_f32_ns
    gemm_moe_mxfp4_f32
    gemv_moe_mxfp4_f32
    gemm_moe_mxfp4_f32_ns
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -28,6 +28,7 @@
 #include <memory>
 #include <charconv>
 #include <mutex>
+#include <regex>

 #undef MIN
 #undef MAX
@@ -396,6 +397,8 @@ struct ggml_backend_opencl_context {
    bool has_vector_subgroup_broadcast;
    bool disable_fusion;

+    std::regex *opfilter = nullptr; // regex of ops to not claim
+
    bool adreno_has_large_buffer;
    bool adreno_use_large_buffer;
    ggml_cl_compiler_version adreno_cl_compiler_version;
@@ -539,6 +542,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mm_f16_f32_kq;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
+    cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns;
    cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
    cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
    cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns;
@@ -597,6 +601,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_conv_2d_f16_f32;
    cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
    cl_kernel kernel_timestep_embedding;
+    cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
    cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
    cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns;
    cl_kernel kernel_moe_reorder_b;
@@ -947,6 +952,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q4_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_trans4_ns", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_trans4_ns", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_1  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1", &err), err));
@@ -2881,6 +2888,40 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+    // gemv_moe_q4_0_f32_ns
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemv_moe_q4_0_f32_ns.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemv_moe_q4_0_f32_ns.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemv_moe_q4_0_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q4_0_f32_ns", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemm_moe_q4_0_f32_ns
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_moe_q4_0_f32_ns.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_moe_q4_0_f32_ns.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemm_moe_q4_0_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q4_0_f32_ns", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
    // gemv_moe_mxfp4_f32_ns
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3494,6 +3535,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

    backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;

+    const char * str_opfilter = getenv("GGML_OPENCL_OPFILTER");
+    if (str_opfilter) {
+        backend_ctx->opfilter = new std::regex(str_opfilter, std::regex_constants::icase);
+        GGML_LOG_INFO("ggml_opencl: opfilter regex = \"%s\"\n", str_opfilter);
+    }
+
    dev_ctx->backend_ctx = backend_ctx.release();
    return dev_ctx->backend_ctx;
 }
@@ -3648,11 +3695,14 @@ struct ggml_tensor_extra_cl_q4_0 {
            CL_CHECK(clReleaseMemObject(d));
            d = nullptr;
        }
+        if (q_img != nullptr) {
+            CL_CHECK(clReleaseMemObject(q_img));
+            q_img = nullptr;
+        }
        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
        // So, there is no need to release them here.
        // TODO: initialize them for non SMALL_PATH path, or remove them.
-        q_img = nullptr;
        d_img = nullptr;
        size_q = 0;
        size_d = 0;
@@ -4143,6 +4193,11 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
    ggml_backend_opencl_device_context * dev_ctx     = (ggml_backend_opencl_device_context *)dev->context;
    ggml_backend_opencl_context *        backend_ctx = dev_ctx->backend_ctx;

+    // reject ops that match the opfilter regex
+    if (backend_ctx->opfilter && std::regex_match(std::string(ggml_op_desc(op)), *backend_ctx->opfilter)) {
+        return false;
+    }
+
    switch (op->op) {
        case GGML_OP_NONE:
            return true;
@@ -4912,17 +4967,53 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);

-        //cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
-    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        // Adreno moe q4_0 kernel needs special transpose and unshuffling
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0_trans4_ns;
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
+
+            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
+            size_t local_work_size[3] = {64, 2, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clReleaseEvent(evt)); // the returned event holds a reference that we must drop
+            CL_CHECK(clReleaseMemObject(data_device));
+
+            // Create image for Q on top of extra->q: one CL_R / UINT32 texel per 8 packed 4-bit weights
+            cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32};
+            cl_image_desc img_desc_q = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(ggml_nelements(tensor) / 8),
+                0, 0, 0, 0, 0, 0, 0, { extra->q }
+            };
+            CL_CHECK((extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err), err));
+            tensor->extra = extra;
+
+            return;
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;

        // The optimized kernels need weights in natural order, so unshuffle.
        if (use_adreno_kernels(backend_ctx, tensor)) {
            kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
        }
-    #else
+#else
        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
-    #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
@@ -4938,7 +5029,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
        tensor->extra = extra;

        // transpose the weights and scales
-    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        // Only do transpose for large, non batched matrix
        // TODO: use preallocated images instead of sub-buffer then image
        if (use_adreno_kernels(backend_ctx, tensor)) {
@@ -4952,10 +5043,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
            // Transpose d as ushort
            transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
        }
-    #endif // GGML_OPENCL_USE_ADRENO_KERNELS
-
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
        return;
-
    }
    if (tensor->type == GGML_TYPE_Q4_1) {
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
@@ -5675,6 +5764,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;

 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+            // Rebuild block_q4_0 data from the transposed q/d buffers on the device, then read it back.
+            cl_int err;
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0_trans4_ns;
+
+            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, ggml_nbytes(tensor), NULL, &err);
+            CL_CHECK(err);
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            int ne02 = tensor->ne[2];
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+
+            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
+            size_t local_work_size[3] = {64, 2, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            CL_CHECK(clReleaseEvent(evt)); // the returned event holds a reference that we must drop
+            CL_CHECK(clEnqueueReadBuffer(
+                queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+            CL_CHECK(clReleaseMemObject(data_device));
+            return;
+        }
        if (use_adreno_kernels(backend_ctx, tensor)) {
            ggml_cl_buffer buf_trans_q;
            ggml_cl_buffer buf_trans_d;
@@ -12797,6 +12916,179 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
    // subgroup mat vec
    switch (src0->type) {
        case GGML_TYPE_Q4_0: {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+            if (use_adreno_moe_kernels(backend_ctx, src0)) {
+                cl_int status;
+
+                size_t local_size[3] = {64, 2, 1};
+                size_t global_size[3] = {64, 2, 1};
+
+                if (ne12 == 1) { // for gemv
+                    kernel = backend_ctx->kernel_gemv_moe_q4_0_f32_ns;
+
+                    cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
+
+                    // create a sub_buffer for src2
+                    cl_buffer_region region;
+                    region.origin = offset2;
+                    region.size = ne20 * ne21 * sizeof(int);
+                    buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // set thread grid
+                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[1] = 4;
+                    global_size[2] = static_cast<size_t>(ne20);
+                    local_size[1] = 4;
+
+                    // create a sub_buffer for src1
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // create image for src1
+                    cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
+                    buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_0->q));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_0->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src1_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extrad->data_device));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong),  &offsetd));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne11));
+
+                    // launch kernel
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    // deallocate sub buffers and images
+                    CL_CHECK(clReleaseMemObject(src1_sub_buffer));
+                    CL_CHECK(clReleaseMemObject(buf_src1_image));
+                    CL_CHECK(clReleaseMemObject(buf_src2));
+
+                } else { // for gemm
+                    kernel = backend_ctx->kernel_gemm_moe_q4_0_f32_ns;
+
+                    // Reorder router if called from test-backend-ops or when new router is generated.
+                    // Otherwise reuse the reordered result from previous mul_mat_id call.
+                    if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
+                        moe_router_reoerder(backend, src2, ne20);
+                        backend_ctx->toggle_reorder = false;
+                    }
+
+                    cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
+                    cl_mem buf_src2, buf_src2_emap;
+
+                    cl_buffer_region region;
+                    region.origin = 0;
+                    region.size = sizeof(int) * max_post_router_tile * n_tile_size;
+                    buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    region.origin = 0;
+                    region.size = sizeof(short) * max_post_router_tile;
+                    buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Reorder activations
+                    // create a sub_buffer for src1
+                    region.origin = offset1;
+                    region.size = ne10 * ne11 * ne12 * sizeof(float);
+                    sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+                    CL_CHECK(status);
+
+                    // Create image for reordered src1
+                    // Use pre-allocated placeholder
+                    region.origin = 0;
+                    region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
+                    backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
+                    buf_src1_reordered = clCreateSubBuffer(
+                        backend_ctx->prealloc_act_trans.buffer,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    cl_image_format image_format_buf_src1;
+                    cl_image_desc image_desc_buf_src1;
+                    image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+                    image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
+                    image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+                    CL_CHECK(status);
+
+                    unsigned short map_ratio = ne20 / ne11;
+                    GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem),        &sub_buf_src1_pre));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem),        &buf_src2));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem),        &buf_src1_reordered));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem),        &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int),  &ne00));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short),  &map_ratio));
+                    CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int),  &n_tile_size));
+
+                    size_t reorder_b_local_size[3] = {256, 1, 1};
+                    size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};
+
+                    // Dispatch reorder kernel
+                    backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);
+
+                    // MoE kernel prepare
+                    // Create sub buffer for dst
+                    region.origin = offsetd;
+                    region.size = ne0 * ne1 * ne2 * sizeof(float);
+                    sub_buf_dst = clCreateSubBuffer(
+                        extrad->data_device,
+                        0,
+                        CL_BUFFER_CREATE_TYPE_REGION,
+                        &region,
+                        &status);
+                    CL_CHECK(status);
+                    // Create image for dst
+                    cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
+                    cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
+                    buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
+                    CL_CHECK(status);
+
+                    // Set kernel args
+                    int arg_idx = 0;
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_0->q_img));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_q4_0->d));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &image_src1_reordered));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2_emap));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_dst_image));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &(backend_ctx->prealloc_total_tiles.buffer)));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
+                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
+
+                    // set thread grid
+                    global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
+                    global_size[2] = static_cast<size_t>(max_post_router_tile);
+                    local_size[1] = 1;
+                    local_size[2] = 1;
+
+                    // Dispatch kernel, then drop our references to the temporary sub-buffers and images
+                    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+                    CL_CHECK(clReleaseMemObject(sub_buf_src1_pre));
+                    CL_CHECK(clReleaseMemObject(buf_src1_reordered));
+                    CL_CHECK(clReleaseMemObject(image_src1_reordered));
+                    CL_CHECK(clReleaseMemObject(buf_src2));
+                    CL_CHECK(clReleaseMemObject(buf_src2_emap));
+                    CL_CHECK(clReleaseMemObject(sub_buf_dst));
+                    CL_CHECK(clReleaseMemObject(buf_dst_image));
+                }
+                return;
+            } // fallback to generic Q4_0 MoE kernel
+
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;

            if (backend_ctx->gpu_family == INTEL) {
--- a/ggml/src/ggml-opencl/kernels/cvt.cl
+++ b/ggml/src/ggml-opencl/kernels/cvt.cl
@@ -190,6 +190,92 @@ kernel void kernel_restore_block_q4_0_noshuffle(
    }
 }

+// Split block_q4_0 data into a scale array (dst_d) and a quant array (dst_q). Both outputs are
+// indexed with i01 fastest (transposed w.r.t. src0) and each block's nibbles are unshuffled.
+kernel void kernel_convert_block_q4_0_trans4_ns(
+    global struct block_q4_0 * src0,
+    __global uint * dst_q,
+    __global half * dst_d,
+    uint ne00,
+    uint ne01
+) {
+    uint i00 = get_global_id(1);
+    uint i01 = get_global_id(0);
+    uint i02 = get_global_id(2);
+    // The launch grid may be rounded up past ne01 in dim 0; surplus work-items must not touch memory.
+    if (i01 >= ne01) { return; }
+
+    uint ne00_blk = ne00 / QK4_0;
+    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
+
+    global struct block_q4_0 * b = src0 + src_blk_offset;
+    dst_d[dst_blk_offset] = b->d;
+
+    // extract quantization and unshuffle: low nibbles -> bytes 0..7, high nibbles -> bytes 8..15
+    ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
+    ushort8 post_block = (ushort8)(0);
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK4_0 / 4; ++i) {
+        uchar x0 = pre_block_ptr[2*i + 0];
+        uchar x1 = pre_block_ptr[2*i + 1];
+        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
+        post_block_ptr[i + QK4_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
+    }
+
+    uint4 q_block = as_uint4(post_block);
+    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
+    dst_q[offset] = q_block.x;
+    dst_q[offset + ne01] = q_block.y;
+    dst_q[offset + ne01 * 2] = q_block.z;
+    dst_q[offset + ne01 * 3] = q_block.w;
+}
+
+// Inverse of kernel_convert_block_q4_0_trans4_ns: gather one block's scale and four quant words
+// from the transposed arrays (src_d, src_q), re-shuffle the nibbles and write a block_q4_0.
+kernel void kernel_restore_block_q4_0_trans4_ns(
+    __global uint * src_q,
+    __global half * src_d,
+    __global struct block_q4_0 * dst0,
+    uint ne00,
+    uint ne01
+) {
+    uint i00 = get_global_id(1);
+    uint i01 = get_global_id(0);
+    uint i02 = get_global_id(2);
+    // The launch grid may be rounded up past ne01 in dim 0; surplus work-items must not touch memory.
+    if (i01 >= ne01) { return; }
+
+    uint ne00_blk = ne00 / QK4_0;
+    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
+    uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
+
+    __global struct block_q4_0 * b = dst0 + dst_blk_offset;
+    b->d = src_d[src_d_offset];
+    // collect transposed quantization parts for a block (four words, ne01 apart)
+    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
+    uint4 q_block;
+    q_block.x = src_q[src_q_offset];
+    q_block.y = src_q[src_q_offset + ne01];
+    q_block.z = src_q[src_q_offset + ne01 * 2];
+    q_block.w = src_q[src_q_offset + ne01 * 3];
+
+    ushort8 post_block = as_ushort8(q_block);
+    ushort8 pre_block = (ushort8)(0);
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK4_0 / 4; ++i) {
+        uchar x0 = post_block_ptr[i + 0];
+        uchar x1 = post_block_ptr[i + QK4_0 / 4];
+        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
+        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
+    }
+    ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
+}
+
 //------------------------------------------------------------------------------
 // kernel_convert_block_q4_1
 // Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
@@ -0,0 +1,252 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#define TILESIZE_K 16
+#define TILESIZE_M 64
+#define TILESIZE_N 32
+
+// Expand the 16 nibbles held in q4.s0..s3 (4 per component, lowest nibble first) into a_f16.s0..sf as (nibble - 8) * scale.
+#define dequantize_q4_0(q4, a_f16, scale) \
+    a_f16.s0 = (half)((q4.s0 & 0x000F) - 8) * scale; \
+    a_f16.s1 = (half)(((q4.s0 & 0x00F0) >> 4) - 8) * scale; \
+    a_f16.s2 = (half)(((q4.s0 & 0x0F00) >> 8) - 8) * scale; \
+    a_f16.s3 = (half)(((q4.s0 & 0xF000) >> 12) - 8) * scale; \
+    a_f16.s4 = (half)((q4.s1 & 0x000F) - 8) * scale; \
+    a_f16.s5 = (half)(((q4.s1 & 0x00F0) >> 4) - 8) * scale; \
+    a_f16.s6 = (half)(((q4.s1 & 0x0F00) >> 8) - 8) * scale; \
+    a_f16.s7 = (half)(((q4.s1 & 0xF000) >> 12) - 8) * scale; \
+    a_f16.s8 = (half)((q4.s2 & 0x000F) - 8) * scale; \
+    a_f16.s9 = (half)(((q4.s2 & 0x00F0) >> 4) - 8) * scale; \
+    a_f16.sa = (half)(((q4.s2 & 0x0F00) >> 8) - 8) * scale; \
+    a_f16.sb = (half)(((q4.s2 & 0xF000) >> 12) - 8) * scale; \
+    a_f16.sc = (half)((q4.s3 & 0x000F) - 8) * scale; \
+    a_f16.sd = (half)(((q4.s3 & 0x00F0) >> 4) - 8) * scale; \
+    a_f16.se = (half)(((q4.s3 & 0x0F00) >> 8) - 8) * scale; \
+    a_f16.sf = (half)(((q4.s3 & 0xF000) >> 12) - 8) * scale; \
+
+// For n = 0..15: c_reg[n] += a_reg (16 halfs) dotted with b_lm[lm_offset + n + {0, 32, 64, 96}] (4 halfs each); sums of 8 products are formed in half in `acc` (declared by the caller) before being added in float.
+#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
+    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
+    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
+    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
+    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
+    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
+    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
+    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
+    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
+    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
+    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
+    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
+    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
+    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
+    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
+    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
+    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
+    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
+    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
+    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
+    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
+    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
+    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
+    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
+    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
+    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
+    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
+    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
+    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
+    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
+    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
+    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
+    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
+    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
+    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
+    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
+    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
+    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
+    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
+    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
+    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
+    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
+    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
+    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
+    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
+    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
+    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
+    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
+    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
+    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
+    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
+    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
+    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
+    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
+    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
+    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
+    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
+    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
+    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
+    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
+    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
+    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
+    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
+    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+
+// MoE GEMM on transposed Q4_0 weights: each work-item accumulates one row of a TILESIZE_M x TILESIZE_N output tile (tile n uses expert src2_emap[n]) and scatters it to dst via the indices in src2.
+__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
+kernel void kernel_gemm_moe_q4_0_f32_ns(
+        __read_only  image1d_buffer_t src0_q,
+        __global     half *           src0_d,
+        __read_only  image1d_buffer_t src1,
+        __global     uint *           src2,
+        __global     ushort *         src2_emap,
+        __write_only image1d_buffer_t dst,
+        __global     int *            total_tiles,
+        uint ne00,
+        uint ne01
+) {
+    uint block_id_m = get_global_id(1); // m_tile
+    uint block_id_n = get_global_id(2); // n_tile
+
+    // Boundary check. NOTE(review): work-items returning here skip the cooperative shared_b stores and the barriers below -- presumably ne01 is a multiple of TILESIZE_M; confirm.
+    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+        return;
+    }
+
+    __private half16 reg_a;
+    __private float32 reg_c = (float32)(0);
+    __local half4 shared_b[128];
+
+    const ushort expert_id = src2_emap[block_id_n];
+
+    const uint row = block_id_m * TILESIZE_M;
+    const uint col = block_id_n * TILESIZE_N;
+
+    uint sub_block_id_m = get_local_id(0);
+    uint2 b_global_offset;
+    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
+    b_global_offset.y = b_global_offset.x + (16 * ne00);
+    uint2 b_local_offset;
+    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
+    b_local_offset.y = b_local_offset.x + 16;
+
+    // Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
+    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
+        // First sub-block
+        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
+        uint b_sub_offset = col * ne00 + step;
+
+        // Load scale for current Q4_0 block
+        uint s_offset = s_sub_offset + get_global_id(0);
+        half s = src0_d[s_offset];
+
+        // Load 16 q (64-bits) in transposed layout
+        uint2 q4x16;
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        float8 bx8_f32;
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        half8 bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization
+        dequantize_q4_0(as_ushort4(q4x16), reg_a, s);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // 32 16x16 fp16 dot product with 8 elements reduction for better precision
+        half16 acc;
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+
+        // Repeat for second sub-block
+        uint half_step = step + TILESIZE_K;
+        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        b_sub_offset = col * ne00 + half_step;
+
+        // Load next 16 q (64-bits) in transposed layout
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization
+        dequantize_q4_0(as_ushort4(q4x16), reg_a, s);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // 32 16x16 fp16 dot product with 8 elements reduction for better precision
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+    }
+
+    // Load post-router output indices and share them in LM
+    __local uint out_idx[TILESIZE_N];
+
+    if (get_local_id(0) < TILESIZE_N) {
+        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
+        if (idx == 0xFFFFFFFF) {
+            idx = src2[block_id_n * TILESIZE_N + 0];
+        }
+        out_idx[get_local_id(0)] = idx * ne01;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scatter results back to original position in output grid
+    uint m_offset = row + get_local_id(0);
+
+    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
+    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
+    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
+    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
+    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
+    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
+    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
+    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
+    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
+    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
+    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
+    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
+    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
+    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
+    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
+    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
+    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
+    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
+    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
+    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
+    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
+    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
+    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
+    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
+    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
+    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
+    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
+    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
+    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
+    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
+    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
+
+    // Padding entries (0xFFFFFFFF) were redirected above to the tile's first output index; write the real result for that index last so it overrides them
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
+}
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
@@ -0,0 +1,116 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#define QK_Q4_0 32
+#define N_SIMDGROUP 4
+#define SIMDGROUP_WIDTH 64
+
+static inline float8 q4_0_to_fp32_packed8(ushort2 q4x8) {
+    // Expand eight packed 4-bit codes (lowest nibble of .s0 first) to floats, each re-centred by -8.
+    return (float8)(
+        (float)(( q4x8.s0        & 0xF) - 8),
+        (float)(((q4x8.s0 >>  4) & 0xF) - 8),
+        (float)(((q4x8.s0 >>  8) & 0xF) - 8),
+        (float)(((q4x8.s0 >> 12) & 0xF) - 8),
+        (float)(( q4x8.s1        & 0xF) - 8),
+        (float)(((q4x8.s1 >>  4) & 0xF) - 8),
+        (float)(((q4x8.s1 >>  8) & 0xF) - 8),
+        (float)(((q4x8.s1 >> 12) & 0xF) - 8));
+}
+
+
+__attribute__((qcom_reqd_sub_group_size("half")))
+__kernel void kernel_gemv_moe_q4_0_f32_ns(
+    __global uint * src0_q, // packed 4-bit weights, read as 32-bit words (four words per 32-value block)
+    __global half * src0_d, // half-precision scales, one per (block, row, expert)
+    __read_only image1d_buffer_t src1, // activations, fetched below as float4 texels
+    __global uint * src2, // expert ids, indexed by the dimension-2 global id
+    __global float * dst, // output; advanced by (offsetd >> 2) floats before the store
+    ulong         offsetd, // offset applied to dst; NOTE(review): the >> 2 suggests bytes, confirm with the host code
+    int           ne00, // values per weight row; the loop covers ne00 / QK_Q4_0 blocks
+    int           ne01, // stride between the words/scales of consecutive blocks, and between output columns in dst
+    int           ne11 // i20 is wrapped modulo ne11 to select the activation row
+) {
+    uint i01  = get_global_id(0); // output row computed by this work-item
+    uint i20  = get_global_id(2); // position in src2; also selects the dst column
+    uint sgid = get_local_id(1); // used as the subgroup number: strides the block loop and picks the reduceLM slot
+    uint slid = get_sub_group_local_id(); // lane within the subgroup
+
+    uint i11 = i20 % ne11;
+
+    uint expert_id = src2[i20];
+    uint expert_offset = expert_id * ne00 * ne01 / 32; // 32-value blocks preceding this expert; NOTE(review): 32-bit arithmetic, confirm it cannot overflow
+
+    __private float sum = 0.0f; // per-thread partial sum for one output element
+
+    // walk ne00 one block at a time; work-items with the same sgid take every N_SIMDGROUP-th block
+    for (uint ib00 = sgid; ib00 < (ne00 / QK_Q4_0); ib00 += N_SIMDGROUP) {
+
+        // load the four 32-bit words of this block; they lie ne01 words apart
+        uint4 regQ;
+        uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;
+
+        regQ.s0 = src0_q[block_offset];
+        regQ.s1 = src0_q[block_offset + ne01];
+        regQ.s2 = src0_q[block_offset + ne01 * 2];
+        regQ.s3 = src0_q[block_offset + ne01 * 3];
+
+        uint offset = i11 * ne00 / 4 + ib00 * 8; // float4 texel index into src1: 8 texels (32 floats) per block
+
+        float8 fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s0)); // word 0 -> weights 0..7
+
+        float4 shared_y4;
+        shared_y4 = read_imagef(src1, (offset + 0));
+        float4 acc = shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (offset + 1));
+        acc += shared_y4 * fp32x8.hi;
+
+        fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s1)); // word 1 -> weights 8..15
+
+        shared_y4 = read_imagef(src1, (offset + 2));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (offset + 3));
+        acc += shared_y4 * fp32x8.hi;
+
+
+        fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s2)); // word 2 -> weights 16..23
+
+        shared_y4 = read_imagef(src1, (offset + 4));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (offset + 5));
+        acc += shared_y4 * fp32x8.hi;
+
+
+        fp32x8 = q4_0_to_fp32_packed8(as_ushort2(regQ.s3)); // word 3 -> weights 24..31
+
+        shared_y4 = read_imagef(src1, (offset + 6));
+        acc += shared_y4 * fp32x8.lo;
+
+        shared_y4 = read_imagef(src1, (offset + 7));
+        acc += shared_y4 * fp32x8.hi;
+
+        half regS = src0_d[ib00 * ne01 + i01 + expert_offset]; // scale of this block
+        sum += (float)(regS) * ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); // fold the four lanes of acc, then apply the scale
+    }
+
+    // reduction in local memory, assumes #subgroups=4
+    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; // one slot per lane for subgroups 1..3
+    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
+    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
+    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE); // partial sums must be visible before subgroup 0 reads them
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // one output per thread in subgroup 0
+    if (sgid == 0) {
+        dst = dst + (offsetd >> 2);
+        dst[i01 + i20 * ne01] = sum;
+    }
+
+}
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -135,7 +135,11 @@ endif()

 if (GGML_SYCL_TARGET STREQUAL "INTEL")
    add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
-    target_link_options(ggml-sycl PRIVATE  -Xs   -ze-intel-greater-than-4GB-buffer-required)
+    if (NOT GGML_SYCL_DEVICE_ARCH)  # the >4GB-buffer backend flag is passed only when no device arch is requested
+        target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
+    else()
+        message(STATUS "Skipping -ze-intel-greater-than-4GB-buffer-required for spir64_gen AOT")
+    endif()

    # Link against Intel oneMKL
    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -160,7 +164,15 @@ if (GGML_SYCL_HOST_MEM_FALLBACK)
 endif()

 if (GGML_SYCL_DEVICE_ARCH)
-    target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
-    target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
+    message(STATUS "GGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} (AOT via spir64_gen)")  # the same two flags are applied at compile and at link time
+    target_compile_options(
+        ggml-sycl PRIVATE
+        -fsycl-targets=spir64_gen
+        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""  # SHELL: keeps the flag and its quoted value together as one option group
+    )
+    target_link_options(
+        ggml-sycl PRIVATE
+        -fsycl-targets=spir64_gen
+        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""  # must match the compile-time option above
+    )
 endif()
-
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -25,6 +25,7 @@
 #include "presets.hpp"
 #include "type.hpp"
 #include "sycl_hw.hpp"
+#include "fattn-buffers.hpp"

 namespace syclexp = sycl::ext::oneapi::experimental;

@@ -404,12 +405,16 @@ struct ggml_backend_sycl_context {
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;

+    std::unique_ptr<ggml_sycl_fattn_kv_buffers> fattn_bufs[GGML_SYCL_MAX_DEVICES];
+
    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

+    static std::unique_ptr<ggml_sycl_fattn_kv_buffers> new_fattn_kv_buffers(queue_ptr qptr, int device);
+
    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device,0), device);
@@ -421,6 +426,17 @@ struct ggml_backend_sycl_context {
        return pool(device);
    }

+    ggml_sycl_fattn_kv_buffers & fattn_buffers(int device) {  // K/V buffer pair for `device`, created on first request
+        if (fattn_bufs[device] == nullptr) {
+            fattn_bufs[device] = new_fattn_kv_buffers(stream(device, 0), device);  // bound to stream 0 of that device
+        }
+        return *fattn_bufs[device];
+    }
+
+    ggml_sycl_fattn_kv_buffers & fattn_buffers() {  // convenience overload for this context's own `device`
+        return fattn_buffers(device);
+    }
+
 #ifdef GGML_SYCL_GRAPH
    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
 #endif
--- a/ggml/src/ggml-sycl/convert.cpp
+++ b/ggml/src/ggml-sycl/convert.cpp
@@ -252,6 +252,23 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
 #endif
 }

+template <typename dst_t>
+static void dequantize_row_q5_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    // Launch geometry: one 64-thread work-group per QK_K-element super-block.
+    const int64_t        n_blocks = k / QK_K;
+    const sycl::range<3> local_sz(1, 1, 64);
+    const sycl::range<3> global_sz = sycl::range<3>(1, 1, n_blocks) * local_sz;
+
+    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<uint8_t, 1> scales_lm(sycl::range<1>(K_SCALE_SIZE), cgh);
+        cgh.parallel_for(sycl::nd_range<3>(global_sz, local_sz), [=](sycl::nd_item<3> it) {
+            dequantize_block_q5_K_reorder(vx, y, get_pointer(scales_lm), it, n_blocks);
+        });
+    });
+}
+
 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
                                     dpct::queue_ptr stream) {
@@ -643,7 +660,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
                return dequantize_row_q4_K_sycl;
            }
        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_sycl;
+            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q5_K_sycl_reorder;
+            } else {
+                return dequantize_row_q5_K_sycl;
+            }
        case GGML_TYPE_Q6_K:
            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                return dequantize_row_q6_K_sycl_reorder;
@@ -718,7 +739,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
                return dequantize_row_q4_K_sycl;
            }
        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_sycl;
+            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q5_K_sycl_reorder;
+            } else {
+                return dequantize_row_q5_K_sycl;
+            }
        case GGML_TYPE_Q6_K:
            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                return dequantize_row_q6_K_sycl_reorder;
--- a/ggml/src/ggml-sycl/cumsum.cpp
+++ b/ggml/src/ggml-sycl/cumsum.cpp
@@ -0,0 +1,148 @@
+#include "cumsum.hpp"
+#include "common.hpp"
+
+#include <algorithm>
+
+#define SYCL_CUMSUM_BLOCK_SIZE 256
+
+static __dpct_inline__ float warp_prefix_inclusive_sum_f32(float x, const sycl::nd_item<3> & item) {  // inclusive prefix sum of x over the caller's sub-group
+    return sycl::inclusive_scan_over_group(item.get_sub_group(), x, sycl::plus<float>());
+}
+
+static void cumsum_f32_kernel( // inclusive running sum along dim 0; each work-group processes one row
+        const float * __restrict__ src, float * __restrict__ dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, // src strides of dims 1..3, in elements
+        const int64_t  d1, const int64_t  d2, const int64_t  d3, // dst strides of dims 1..3, in elements
+        const sycl::nd_item<3> & item, float * smem) {
+
+    const int tid = item.get_local_id(2);
+    const int block_size = item.get_local_range(2);
+    const int lane = tid % WARP_SIZE;
+    const int warp = tid / WARP_SIZE;
+    const int warps_per_block = block_size / WARP_SIZE;
+
+    float * s_vals      = smem; // [block_size] scanned per-thread totals
+    float * s_warp_sums = smem + block_size; // [warps_per_block] per-warp totals, later replaced by their exclusive scan
+    float * s_carry     = smem + block_size + warps_per_block; // [0] total of all earlier tiles, [1] total of the current tile
+
+    if (tid == 0) {
+        s_carry[0] = 0.0f;
+    }
+    item.barrier(sycl::access::fence_space::local_space);
+
+    const int64_t i3 = item.get_group(0);
+    const int64_t i2 = item.get_group(1);
+    const int64_t i1 = item.get_group(2);
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { // depends on group ids only, so a whole work-group leaves together
+        return;
+    }
+
+    const float * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
+    float       * dst_row = dst + i1 * d1  + i2 * d2  + i3 * d3;
+
+    constexpr int num_unroll = 4; // consecutive elements owned by each thread within a tile
+    float temp[num_unroll]; // running sums over this thread's own elements
+
+    for (int64_t i = 0; i < ne00; i += num_unroll * block_size) { // one tile of num_unroll * block_size elements per pass
+        int64_t idx = i + tid * num_unroll;
+
+        temp[0] = (idx < ne00 ? src_row[idx] : 0.0f);
+#pragma unroll
+        for (int j = 1; j < num_unroll; j++) {
+            temp[j] = temp[j - 1];
+            if (idx + j < ne00) {
+                temp[j] += src_row[idx + j];
+            }
+        }
+
+        float val = (idx < ne00) ? temp[num_unroll - 1] : 0.0f; // this thread's total for the tile
+
+        val = warp_prefix_inclusive_sum_f32(val, item); // inclusive scan of the thread totals within the sub-group
+        s_vals[tid] = val;
+
+        if (lane == WARP_SIZE - 1) { // the last lane holds the total of its warp
+            s_warp_sums[warp] = val;
+        }
+        item.barrier(sycl::access::fence_space::local_space);
+
+        if (warp == 0) { // the first warp scans the per-warp totals
+            float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
+            float inc = warp_prefix_inclusive_sum_f32(w, item);
+            if (tid < warps_per_block) {
+                s_warp_sums[tid] = inc - w; // inclusive minus own value = sum of the preceding warps
+            }
+            if (tid == warps_per_block - 1) {
+                s_carry[1] = inc; // total of the whole tile
+            }
+        }
+        item.barrier(sycl::access::fence_space::local_space);
+
+        float carry = s_carry[0];
+        float final_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1]; // sum of everything before this thread's first element
+
+#pragma unroll
+        for (int j = 0; j < num_unroll; j++) {
+            if (idx + j < ne00) {
+                dst_row[idx + j] = temp[j] + final_offset;
+            }
+        }
+
+        item.barrier(sycl::access::fence_space::local_space); // all threads have read s_carry[0] before it is advanced
+
+        if (tid == 0) {
+            s_carry[0] += s_carry[1];
+        }
+    }
+}
+
+inline void ggml_sycl_op_cumsum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // host launcher for cumsum_f32_kernel (F32 in, F32 out)
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const float * src_d = static_cast<const float *>(src0->data);
+    float       * dst_d = static_cast<float *>(dst->data);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t ts = sizeof(float); // byte strides (nb) are converted to element strides below
+    const int64_t s01 = src0->nb[1] / ts;
+    const int64_t s02 = src0->nb[2] / ts;
+    const int64_t s03 = src0->nb[3] / ts;
+    const int64_t d1  = dst->nb[1] / ts;
+    const int64_t d2  = dst->nb[2] / ts;
+    const int64_t d3  = dst->nb[3] / ts;
+
+    const int num_warps = (ne00 + WARP_SIZE - 1) / WARP_SIZE;
+    int block_size = num_warps * WARP_SIZE; // row length rounded up to whole warps ...
+    block_size = std::min(block_size, SYCL_CUMSUM_BLOCK_SIZE); // ... capped at the maximum group size
+    const int warps_per_block = block_size / WARP_SIZE;
+    const int smem_size = block_size + warps_per_block + 2; // floats: per-thread values + per-warp sums + 2 carry slots, as laid out by the kernel
+
+    const sycl::range<3> grid(ne03, ne02, ne01); // one work-group per row
+    const sycl::range<3> block(1, 1, block_size);
+
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<float, 1> smem_acc(sycl::range<1>(smem_size), cgh);
+        cgh.parallel_for(
+            sycl::nd_range<3>(grid * block, block),
+            [=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { // the kernel's lane/warp arithmetic uses WARP_SIZE
+                cumsum_f32_kernel(src_d, dst_d, ne00, ne01, ne02, ne03,
+                                  s01, s02, s03, d1, d2, d3,
+                                  item, get_pointer(smem_acc));
+            });
+    });
+}
+
+void ggml_sycl_cumsum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // public entry point: debug-print scope, then the op itself
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_cumsum(ctx, dst);
+}
--- a/ggml/src/ggml-sycl/cumsum.hpp
+++ b/ggml/src/ggml-sycl/cumsum.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "common.hpp"
+
+void ggml_sycl_cumsum(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-sycl/dequantize.hpp
+++ b/ggml/src/ggml-sycl/dequantize.hpp
@@ -537,6 +537,63 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
 #endif
 }

+template <typename dst_t>
+static void dequantize_block_q5_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                          uint8_t * scales_local, const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
+    const int64_t ib = item_ct1.get_group(2);  // super-block handled by this work-group
+    // Dequantizes one Q5_K super-block, stored in the reordered (field-by-field) layout, into yy.
+#if QK_K == 256
+    // assume 64 threads
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il  = tid / 16;   // 0...3
+    const int64_t ir  = tid % 16;   // 0...15
+    const int64_t is  = 2 * il;     // first of the two scale/min pairs used by this thread
+
+    dst_t * y = yy + ib * QK_K + 64 * il + 2 * ir;  // each thread writes y[0], y[1], y[32] and y[33]
+
+    const uint8_t * base = static_cast<const uint8_t *>(vx);
+
+    // Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales (K_SCALE_SIZE per block)] [dm (half2 per block)]
+    const size_t qs_offset     = ib * (QK_K / 2);
+    const size_t qh_offset     = n_blocks * (QK_K / 2) + ib * (QK_K / 8);
+    const size_t scales_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + ib * K_SCALE_SIZE;
+    const size_t dm_offset     = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + n_blocks * K_SCALE_SIZE + ib * sizeof(ggml_half2);
+
+    const uint8_t *  qs_ptr     = base + qs_offset;
+    const uint8_t *  qh_ptr     = base + qh_offset;
+    const uint8_t *  scales_ptr = base + scales_offset;
+    const ggml_half2 dm_values  = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
+
+    const float dall = dm_values.x();  // multiplier for the scales
+    const float dmin = dm_values.y();  // multiplier for the mins
+
+    const uint8_t * ql = qs_ptr + 32 * il + 2 * ir;
+    const uint8_t * qh = qh_ptr + 2 * ir;
+
+    if (tid < K_SCALE_SIZE) {  // the first K_SCALE_SIZE threads stage the packed scales in local memory
+        scales_local[tid] = scales_ptr[tid];
+    }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);  // staged scales must be visible to all threads
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, scales_local, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, scales_local, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t hm  = 1 << (2 * il);  // qh bit that carries the fifth bit of the low-nibble values
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;  // next qh bit: fifth bit of the high-nibble values
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    GGML_UNUSED(ib); GGML_UNUSED(vx); GGML_UNUSED(yy); GGML_UNUSED(scales_local); GGML_UNUSED(n_blocks);  // only names that exist in this branch
+    GGML_ABORT("Q5_K reorder dequantize not supported for QK_K != 256");
+#endif
+}
+
 template<typename dst_t>
 static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  const sycl::nd_item<3> &item_ct1) {
--- a/ggml/src/ggml-sycl/diag.cpp
+++ b/ggml/src/ggml-sycl/diag.cpp
@@ -0,0 +1,67 @@
+#include "diag.hpp"
+#include "common.hpp"
+
+#define SYCL_DIAG_BLOCK_SIZE 256
+
+template <typename T>
+static void diag_kernel(T * __restrict__ dst, const T * __restrict__ src,
+                        const int64_t ne0, const int64_t ne1,
+                        const int64_t ne2, const int64_t ne3,
+                        const int64_t total_elements,
+                        const sycl::nd_item<1> & item) {
+    (void)ne3;  // not needed by the index math below
+    // One work-item per destination element; surplus ids do nothing.
+    const int64_t flat = item.get_global_id(0);
+    if (flat >= total_elements) {
+        return;
+    }
+
+    // Split the flat id into the four coordinates of dst.
+    const int64_t plane = ne0 * ne1;
+    const int64_t col   = flat % ne0;
+    const int64_t row   = (flat / ne0) % ne1;
+    const int64_t b2    = (flat / plane) % ne2;
+    const int64_t b3    = flat / (plane * ne2);
+
+    // Each (b3, b2) pair selects one source vector of ne0 values.
+    const int64_t batch = b3 * ne2 + b2;
+    const int64_t out   = (batch * ne1 + row) * ne0 + col;
+
+    // Copy the vector onto the diagonal and zero everything off it.
+    dst[out] = (col == row) ? src[batch * ne0 + col] : T(0);
+}
+
+inline void ggml_sycl_op_diag(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // host launcher for diag_kernel: one work-item per dst element
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(src0->ne[1] == 1); // the source holds a single row per batch
+
+    dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+    const int64_t n_elems = ggml_nelements(dst);
+    const int64_t num_blocks = (n_elems + SYCL_DIAG_BLOCK_SIZE - 1) / SYCL_DIAG_BLOCK_SIZE; // round up to whole work-groups
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32); // only the float instantiation is launched below
+    stream->parallel_for(
+        sycl::nd_range<1>(num_blocks * SYCL_DIAG_BLOCK_SIZE, SYCL_DIAG_BLOCK_SIZE),
+        [=](sycl::nd_item<1> item) {
+            diag_kernel(static_cast<float *>(dst_d),
+                        static_cast<const float *>(src0_d),
+                        ne0, ne1, ne2, ne3, n_elems, item);
+        });
+}
+
+void ggml_sycl_diag(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // public entry point: debug-print scope, then the op itself
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_diag(ctx, dst);
+}
--- a/ggml/src/ggml-sycl/diag.hpp
+++ b/ggml/src/ggml-sycl/diag.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "common.hpp"
+
+void ggml_sycl_diag(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-sycl/fattn-buffers.cpp
+++ b/ggml/src/ggml-sycl/fattn-buffers.cpp
@@ -0,0 +1,56 @@
+//
+// MIT license
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "common.hpp"
+
+sycl::half * ggml_sycl_fattn_kv_buffers::kv_buffer::ensure_half(size_t n_elems) {
+    // Returns device storage for at least n_elems halves. Growing drops the old contents.
+    const size_t need_bytes = n_elems * sizeof(sycl::half);
+
+    // Fast path: the current allocation is already large enough.
+    if (capacity >= need_bytes) {
+        return ptr;
+    }
+
+    if (ptr) {
+        // Wait for the queue to drain before releasing the old buffer.
+        SYCL_CHECK(CHECK_TRY_ERROR(qptr->wait()));
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+        ptr = nullptr;
+        capacity = 0;
+    }
+
+    // Round the request up to a whole number of chunks.
+    const size_t cap = (need_bytes + CHUNK_SIZE - 1) / CHUNK_SIZE * CHUNK_SIZE;
+
+    void * dev_ptr = nullptr;
+    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = sycl::malloc_device(cap, *qptr)));
+
+    if (!dev_ptr) {
+        // %zu is the portable conversion for size_t (unsigned long is 32-bit on LLP64 targets)
+        GGML_LOG_ERROR("%s: can't allocate %zu Bytes of memory on device\n", __func__, cap);
+        GGML_ABORT("fattn buffer alloc failed");
+    }
+
+    ptr = static_cast<sycl::half *>(dev_ptr);
+    capacity = cap;
+    return ptr;
+}
+
+ggml_sycl_fattn_kv_buffers::kv_buffer::~kv_buffer() { // releases the device allocation, if one was ever made
+#ifdef DEBUG_SYCL_POOL
+    GGML_LOG_INFO("ggml_sycl_fattn_kv_buffer[%d]: %.2f MiB\n", device, capacity / 1024.0 / 1024.0);
+#endif
+    if (ptr) {
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); // freed against the same queue that allocated it
+    }
+}
--- a/ggml/src/ggml-sycl/fattn-buffers.hpp
+++ b/ggml/src/ggml-sycl/fattn-buffers.hpp
@@ -0,0 +1,63 @@
+//
+// MIT license
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_FATTN_BUFFERS_HPP
+#define GGML_SYCL_FATTN_BUFFERS_HPP
+
+#include <sycl/sycl.hpp>
+
+typedef sycl::queue *queue_ptr;
+
+struct ggml_sycl_fattn_kv_buffers { // pair of growable device buffers (K and V) created with the same queue and device id
+    // buffers grow in chunks of this size
+    static constexpr size_t CHUNK_SIZE = 16ull << 20; // 16 MiB
+
+    struct kv_buffer { // one device allocation that ensure_half() replaces with a larger one when needed
+        kv_buffer(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {} // stores the queue and id; allocates nothing
+        ~kv_buffer();
+
+        kv_buffer(const kv_buffer &) = delete; // non-copyable: the destructor frees ptr
+        kv_buffer & operator=(const kv_buffer &) = delete;
+
+        sycl::half * ensure_half(size_t n_elems); // storage for at least n_elems halves, reallocating if too small
+
+    private:
+        sycl::half * ptr      = nullptr; // current device allocation, or nullptr
+        size_t       capacity = 0; // size of that allocation in bytes
+        queue_ptr    qptr     = nullptr; // queue used to allocate and free
+        [[maybe_unused]] int device = 0; // only read by the DEBUG_SYCL_POOL log in the destructor
+    };
+
+    kv_buffer K;
+    kv_buffer V;
+
+    ggml_sycl_fattn_kv_buffers(queue_ptr qptr, int device) : K(qptr, device), V(qptr, device) {}
+
+    ggml_sycl_fattn_kv_buffers(const ggml_sycl_fattn_kv_buffers &) = delete;
+    ggml_sycl_fattn_kv_buffers & operator=(const ggml_sycl_fattn_kv_buffers &) = delete;
+};
+
+/**
+ * Mimics the `alloc()` / `ptr` interface of `ggml_sycl_pool_alloc` on top of a `kv_buffer`, so the code calling alloc stays unchanged.
+ */
+struct ggml_sycl_fattn_alloc {
+    ggml_sycl_fattn_kv_buffers::kv_buffer & buf; // borrowed; this adapter never frees it
+    sycl::half *                         ptr = nullptr; // result of the most recent alloc(), nullptr before the first call
+
+    explicit ggml_sycl_fattn_alloc(ggml_sycl_fattn_kv_buffers::kv_buffer & buf_) : buf(buf_) {}
+
+    sycl::half * alloc(size_t n_elems) { // forwards to ensure_half() and remembers the returned pointer
+        ptr = buf.ensure_half(n_elems);
+        return ptr;
+    }
+};
+#endif
--- a/ggml/src/ggml-sycl/fattn-common.hpp
+++ b/ggml/src/ggml-sycl/fattn-common.hpp
@@ -5,6 +5,7 @@
 #include "common.hpp"
 #include "convert.hpp"
 #include "vecdotq.hpp"
+#include "fattn-buffers.hpp"

 #include "ggml.h"

@@ -918,12 +919,13 @@ void launch_fattn(
    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);

    ggml_sycl_pool & pool = ctx.pool();
+    ggml_sycl_fattn_kv_buffers & fbuf = ctx.fattn_buffers();
    dpct::queue_ptr  main_stream = ctx.stream();
    const int id  = ggml_sycl_get_device();
    const int nsm = ggml_sycl_info().devices[id].nsm;

-    ggml_sycl_pool_alloc<sycl::half>   K_f16(pool);
-    ggml_sycl_pool_alloc<sycl::half>   V_f16(pool);
+    ggml_sycl_fattn_alloc        K_f16(fbuf.K);
+    ggml_sycl_fattn_alloc        V_f16(fbuf.V);
    ggml_sycl_pool_alloc<int>    KV_max(pool);
    ggml_sycl_pool_alloc<float>  dst_tmp(pool);
    ggml_sycl_pool_alloc<sycl::float2> dst_tmp_meta(pool);
--- a/ggml/src/ggml-sycl/fill.cpp
+++ b/ggml/src/ggml-sycl/fill.cpp
@@ -0,0 +1,55 @@
+#include "fill.hpp"
+#include "common.hpp"
+
+#define SYCL_FILL_BLOCK_SIZE 256
+
+template <typename T>
+static void fill_kernel(T * dst, const int64_t k, const T value,
+                        const sycl::nd_item<1> & item) {
+    // One work-item per element; ids at or past k leave dst untouched.
+    const auto gid = static_cast<int64_t>(item.get_global_id(0));
+    if (gid < k) {
+        dst[gid] = value;
+    }
+}
+
+inline void ggml_sycl_op_fill(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    // Sets every element of the contiguous tensor dst to the float stored in its op_params.
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    // The fill value is carried as raw float bytes at the start of op_params.
+    float fill_value;
+    memcpy(&fill_value, dst->op_params, sizeof(float));
+
+    // Round the launch up to whole work-groups; fill_kernel ignores ids past n_elems.
+    const int64_t n_elems  = ggml_nelements(dst);
+    const int64_t n_groups = (n_elems + SYCL_FILL_BLOCK_SIZE - 1) / SYCL_FILL_BLOCK_SIZE;
+    const sycl::nd_range<1> launch(n_groups * SYCL_FILL_BLOCK_SIZE, SYCL_FILL_BLOCK_SIZE);
+    void * out = dst->data;
+
+    if (dst->type == GGML_TYPE_F32) {
+        stream->parallel_for(launch, [=](sycl::nd_item<1> item) {
+            fill_kernel(static_cast<float *>(out), n_elems, fill_value, item);
+        });
+        return;
+    }
+
+    if (dst->type == GGML_TYPE_F16) {
+        // Narrow to half once on the host; the kernel then stores it unchanged.
+        sycl::half fill_half = sycl::half(fill_value);
+        stream->parallel_for(launch, [=](sycl::nd_item<1> item) {
+            fill_kernel(static_cast<sycl::half *>(out), n_elems, fill_half, item);
+        });
+        return;
+    }
+
+    GGML_ABORT("unsupported type");
+}
+
+void ggml_sycl_fill(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // public entry point; the debug scope is told the op has no source tensors
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/0);
+    ggml_sycl_op_fill(ctx, dst);
+}
--- a/ggml/src/ggml-sycl/fill.hpp
+++ b/ggml/src/ggml-sycl/fill.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "common.hpp"
+
+void ggml_sycl_fill(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-sycl/gated_delta_net.hpp
+++ b/ggml/src/ggml-sycl/gated_delta_net.hpp
@@ -5,4 +5,5 @@
 #include "common.hpp"
 #include "ggml.h"

+void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-sycl/getrows.cpp
+++ b/ggml/src/ggml-sycl/getrows.cpp
@@ -183,6 +183,10 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data,
                                src1_i32, (float *)dst->data, ctx.stream());
            break;
+        case GGML_TYPE_BF16:
+            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::ext::oneapi::bfloat16 *)dst->src[0]->data,
+                                src1_i32, (float *)dst->data, ctx.stream());
+            break;
        case GGML_TYPE_F32:
            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
            src1_i32, (float *)dst->data, ctx.stream());
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -54,7 +54,12 @@
 #include "ggml-sycl/set.hpp"
 #include "ggml-sycl/ssm_conv.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
-
+#include "ggml-sycl/ssm_scan.hpp"
+#include "ggml-sycl/fill.hpp"
+#include "ggml-sycl/cumsum.hpp"
+#include "ggml-sycl/diag.hpp"
+#include "ggml-sycl/solve_tri.hpp"
+#include "ggml-sycl/gated_delta_net.hpp"

 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
@@ -1281,6 +1286,23 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}

    ~ggml_sycl_pool_leg() {
+#ifdef DEBUG_SYCL_POOL
+        int    n_cached    = 0;
+        size_t bytes_cached = 0;
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            if (buffer_pool[i].ptr != nullptr) {
+                ++n_cached;
+                bytes_cached += buffer_pool[i].size;
+            }
+        }
+        GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
+                      n_cached, bytes_cached / 1024.0 / 1024.0);
+        const auto slots = format_slots_in_alloc_order();
+        if (!slots.empty()) {
+            GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
+        }
+#endif
+
        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
            ggml_sycl_buffer & b = buffer_pool[i];
            if (b.ptr != nullptr) {
@@ -1291,6 +1313,26 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
        GGML_ASSERT(pool_size == 0);
    }

+#ifdef DEBUG_SYCL_POOL
+    std::string format_slots_in_alloc_order() const {
+        // Renders the size (MiB, two decimals) of every occupied pool slot,
+        // in slot order, joined by '/'. Empty slots are skipped.
+        std::string  joined;
+        const char * sep = "";
+        char         mib[32];
+        for (int slot = 0; slot < MAX_SYCL_BUFFERS; ++slot) {
+            const auto & entry = buffer_pool[slot];
+            if (entry.ptr != nullptr) {
+                joined += sep;
+                sep = "/";
+                snprintf(mib, sizeof(mib), "%.2f", entry.size / 1024.0 / 1024.0);
+                joined += mib;
+            }
+        }
+        return joined;
+    }
+#endif
+
    void * alloc(size_t size, size_t * actual_size) override {
 #ifdef DEBUG_sycl_MALLOC
        int nnz = 0;
@@ -1454,6 +1496,10 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
   return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
 }

+std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
+    return std::make_unique<ggml_sycl_fattn_kv_buffers>(qptr, device);  // K and V start without storage; it is allocated on demand
+}
+
 // TBD pool with virtual memory management
 // struct ggml_sycl_pool_vmm : public ggml_sycl_pool

@@ -3298,6 +3344,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
        case GGML_TYPE_Q8_0:
            return true;
        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return !g_ggml_sycl_prioritize_dmmv;
        default:
@@ -3320,6 +3367,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return true;
        default:
@@ -3536,6 +3584,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
    return true;
 }

+static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
+    GGML_ASSERT(size % sizeof(block_q5_K) == 0);
+    GGML_ASSERT(offset % sizeof(block_q5_K) == 0);  // offset is only validated in this function
+    // Regroups `size` bytes of block_q5_K structs in place into four arrays: [all qs][all qh][all scales][all dm].
+    const int nblocks = size / sizeof(block_q5_K);
+    // Returns false, leaving data_device untouched, when the scratch buffer cannot be allocated; true otherwise.
+    sycl_reorder_temp_buffer tmp(stream, size);
+    if (!tmp) {
+        GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
+        return false;
+    }
+    uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
+    // Snapshot the original blocks: the kernel below reads the snapshot while overwriting data_device.
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();  // explicit wait only when async memory operations are disabled
+    }
+    // Start of each destination region inside data_device, in layout order.
+    auto * qs_ptr     = data_device;
+    auto * qh_ptr     = qs_ptr + (QK_K / 2) * nblocks;
+    auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
+    auto * dm_ptr     = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
+    // One work-item per block copies that block's fields to the block's slot in each region.
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
+        const block_q5_K * x  = (const block_q5_K *) tmp_buf;
+        const int          ib = i;
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+
+        for (int j = 0; j < QK_K / 8; ++j) {
+            qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
+        }
+
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+
+        dm_ptr[ib] = x[ib].dm;
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();  // NOTE(review): the async case presumably relies on queue ordering - confirm
+    }
+    return true;
+}
+
 static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
    GGML_ASSERT(size % sizeof(block_q6_K) == 0);
    GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
@@ -3602,6 +3698,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
            return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
        case GGML_TYPE_Q4_K:
            return reorder_qw_q4_k(data_device, size, 0, stream);
+        case GGML_TYPE_Q5_K:
+            return reorder_qw_q5_k(data_device, size, 0, stream);
        case GGML_TYPE_Q6_K:
            return reorder_qw_q6_k(data_device, size, 0, stream);
        default:
@@ -4394,6 +4492,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_SSM_CONV:
            ggml_sycl_ssm_conv(ctx, dst);
            break;
+        case GGML_OP_SSM_SCAN:
+            ggml_sycl_ssm_scan(ctx, dst);
+            break;
+        case GGML_OP_FILL:
+            ggml_sycl_fill(ctx, dst);
+            break;
+        case GGML_OP_CUMSUM:
+            ggml_sycl_cumsum(ctx, dst);
+            break;
+        case GGML_OP_DIAG:
+            ggml_sycl_diag(ctx, dst);
+            break;
+        case GGML_OP_SOLVE_TRI:
+            ggml_sycl_solve_tri(ctx, dst);
+            break;
        case GGML_OP_ROLL:
            ggml_sycl_roll(ctx, dst);
            break;
@@ -4902,6 +5015,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F16:
+                    case GGML_TYPE_BF16:
                    case GGML_TYPE_F32:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
@@ -5084,11 +5198,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_ACC:
            return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
        case GGML_OP_PAD:
-            // TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985
            if (ggml_get_op_params_i32(op, 8) != 0) {
                return false;
            }
-            return ggml_is_contiguous(op->src[0]);
+            return true;
        case GGML_OP_LEAKY_RELU:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_RWKV_WKV6:
@@ -5104,6 +5217,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
            return op->type == GGML_TYPE_F32;
        case GGML_OP_ARANGE:
            return op->type == GGML_TYPE_F32;
+        case GGML_OP_SSM_SCAN:
+            if (op->src[3]->ne[0] == 1) {
+                // Mamba2
+                // (kernel only supports (d_state == 128 || d_state == 256) && d_head % WARP_SIZE == 0)
+                return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % WARP_SIZE == 0;
+            } else {
+                // TODO Mamba-1 not yet ported to SYCL
+                return false;
+            }
+        case GGML_OP_FILL:
+        case GGML_OP_CUMSUM:
+        case GGML_OP_DIAG:
+            return true;
+        case GGML_OP_SOLVE_TRI:
+            return op->src[0]->ne[0] <= SYCL_SOLVE_TRI_MAX_N && op->src[1]->ne[0] <= SYCL_SOLVE_TRI_MAX_K;
        case GGML_OP_FLASH_ATTN_EXT:
            return ggml_sycl_flash_attn_ext_supported(device, op);
        default:
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -839,6 +839,26 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
    }
 }

+static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
+                                               const int nrows, dpct::queue_ptr stream) {
+    // Host-side launcher of the Q5_K x Q8_1 mat-vec kernel that consumes the reordered weight layout.
+    constexpr size_t num_subgroups = 16;
+    const int        block_num_y   = ceil_div(nrows, GGML_SYCL_MMV_Y);
+    GGML_ASSERT(ncols % QK_K == 0);
+    GGML_ASSERT(block_num_y % num_subgroups == 0);
+
+    // Dim 2 holds WARP_SIZE work-items per row block; a work-group spans num_subgroups * WARP_SIZE of them.
+    const sycl::nd_range<3> launch_range(sycl::range<3>(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE),
+                                         sycl::range<3>(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE));
+    stream->submit([&](sycl::handler & handler) {
+        handler.parallel_for(launch_range,
+                             [=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(vx, vy, dst, ncols,
+                                                                                              nrows, item);
+                             });
+    });
+}
+
 static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                               const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
@@ -1125,6 +1145,7 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n");
                    reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                } else {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl\n");
                    mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                }
                break;
@@ -1145,7 +1166,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                }
                break;
            case GGML_TYPE_Q5_K:
-                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n");
+                    reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                } else {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl\n");
+                    mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                }
                break;
            case GGML_TYPE_Q6_K:
                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
--- a/ggml/src/ggml-sycl/pad.cpp
+++ b/ggml/src/ggml-sycl/pad.cpp
@@ -13,7 +13,8 @@
 //#include "common.hpp"
 #include "pad.hpp"

-static void pad_f32(const float * src, float * dst,
+static void pad_f32(const float * src, size_t s00, size_t s01, size_t s02, size_t s03,
+                    float * dst,
                    const int lp0, const int rp0, const int lp1, const int rp1,
                    const int lp2, const int rp2, const int lp3, const int rp3,
                    const int ne0, const int ne1, const int ne2, const int ne3,
@@ -27,7 +28,6 @@ static void pad_f32(const float * src, float * dst,
        return;
    }

-    // operation
    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
    if ((i0 >= lp0 && i0 < ne0 - rp0) &&
        (i1 >= lp1 && i1 < ne1 - rp1) &&
@@ -37,12 +37,8 @@ static void pad_f32(const float * src, float * dst,
        const int64_t i01 = i1 - lp1;
        const int64_t i02 = i2 - lp2;
        const int64_t i03 = i3 - lp3;
-        const int64_t ne02 = ne2 - lp2 - rp2;
-        const int64_t ne01 = ne1 - lp1 - rp1;
-        const int64_t ne00 = ne0 - lp0 - rp0;

-        const int64_t src_idx = i03 * (ne00 * ne01 * ne02) +
-                                i02 * (ne00 * ne01) + i01 * ne00 + i00;
+        const int64_t src_idx = i03 * s03 + i02 * s02 + i01 * s01 + i00 * s00;

        dst[dst_idx] = src[src_idx];
    } else {
@@ -50,20 +46,19 @@ static void pad_f32(const float * src, float * dst,
    }
 }

-static void pad_f32_sycl(const float *src, float *dst, const int lp0,
-                         const int rp0, const int lp1, const int rp1,
-                         const int lp2, const int rp2, const int lp3,
-                         const int rp3, const int ne0, const int ne1,
-                         const int ne2, const int ne3,
+static void pad_f32_sycl(const float * src, size_t s00, size_t s01, size_t s02, size_t s03,
+                         float * dst, const int lp0, const int rp0, const int lp1, const int rp1,
+                         const int lp2, const int rp2, const int lp3, const int rp3,
+                         const int ne0, const int ne1, const int ne2, const int ne3,
                         dpct::queue_ptr stream) {
    int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
-    dpct::dim3 gridDim(num_blocks, ne1, ne2 * ne3);
+    sycl::range<3> grid(ne2 * ne3, ne1, num_blocks);
    stream->parallel_for(
-        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
+        sycl::nd_range<3>(grid * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
-            pad_f32(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1,
-                    ne2, ne3, item_ct1);
+            pad_f32(src, s00, s01, s02, s03, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
+                    ne0, ne1, ne2, ne3, item_ct1);
        });
 }

@@ -71,22 +66,27 @@ void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
-    dpct::queue_ptr     stream = ctx.stream();
+    dpct::queue_ptr stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));

-    const int32_t lp0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t rp0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t lp1 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t rp1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t lp2 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t rp2 = ((const int32_t*)(dst->op_params))[5];
-    const int32_t lp3 = ((const int32_t*)(dst->op_params))[6];
-    const int32_t rp3 = ((const int32_t*)(dst->op_params))[7];
+    const size_t ts = ggml_type_size(src0->type);
+    const size_t s00 = src0->nb[0] / ts;
+    const size_t s01 = src0->nb[1] / ts;
+    const size_t s02 = src0->nb[2] / ts;
+    const size_t s03 = src0->nb[3] / ts;

-    pad_f32_sycl(src0_d, dst_d,
+    const int32_t lp0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t rp0 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t lp1 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t rp1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t lp2 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t rp2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t lp3 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t rp3 = ((const int32_t *)(dst->op_params))[7];
+
+    pad_f32_sycl(src0_d, s00, s01, s02, s03, dst_d,
                 lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
 }
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -79,6 +79,31 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
 };

+template <> struct block_q_t<GGML_TYPE_Q5_K> {
+    struct traits {
+        static constexpr uint32_t qk       = QK_K;
+        static constexpr uint32_t qi       = QI5_K;
+        static constexpr uint32_t qr       = QR5_K;
+        static constexpr uint32_t vdr_mmvq = 2;
+    };
+
+    // Layout after reordering: [qs, QK_K/2 bytes per block][qh, QK_K/8 per block][scales][dm]; returns {qs, qh} offsets.
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
+        const auto qs_region_bytes = n_blocks * (QK_K / 2);
+        return { block_index * (QK_K / 2), qs_region_bytes + block_index * (QK_K / 8) };
+    }
+
+    // Byte offsets for block `block_index`: .first addresses its scales, .second its dm (ggml_half2) entry.
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        const auto block_count  = (nrows * (ncols / QK_K));
+        const auto scales_begin = block_count * (QK_K / 2) + block_count * (QK_K / 8);
+        const auto dm_begin     = scales_begin + block_count * K_SCALE_SIZE;
+        return { scales_begin + block_index * K_SCALE_SIZE, dm_begin + block_index * sizeof(ggml_half2) };
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
 template <> struct block_q_t<GGML_TYPE_Q6_K> {
    struct traits {
        static constexpr uint32_t qk       = QK_K;
--- a/ggml/src/ggml-sycl/solve_tri.cpp
+++ b/ggml/src/ggml-sycl/solve_tri.cpp
@@ -0,0 +1,172 @@
+#include "solve_tri.hpp"
+#include "common.hpp"
+#include <oneapi/mkl/blas.hpp>
+
+template <int n_template, int k_template>  // a template value of 0 selects the runtime n_arg / k_arg instead
+static void solve_tri_f32_fast(const float * __restrict__ A,
+                               const float * __restrict__ B,
+                               float * __restrict__ X,
+                               const int64_t ne02, [[maybe_unused]] const int64_t ne03,
+                               const int64_t nb02, const int64_t nb03,
+                               const int64_t nb12, const int64_t nb13,
+                               const int64_t nb2,  const int64_t nb3,
+                               const int n_arg, const int k_arg,
+                               const sycl::nd_item<2> & item, float * sA) {
+    // Per batch: solves A * X = B by forward substitution, reading only entries A[row * n + col] with col <= row.
+    const int n = n_template == 0 ? n_arg : n_template;
+    const int k = k_template == 0 ? k_arg : k_template;
+    // Work-group index (dim 1) = batch; local dim 0 = column of B/X; local dim 1 = lane along the rows.
+    const int batch_idx = item.get_group(1);
+    const int lane      = item.get_local_id(1) % WARP_SIZE;
+    const int col_idx   = item.get_local_id(0);
+
+    if (col_idx >= k) {  // never taken with the (k, WARP_SIZE) block used by the launcher in this file
+        return;
+    }
+    // Unflatten the batch index; nb* are byte strides of dims 2 and 3 (they are added to char pointers).
+    const int64_t i03 = batch_idx / ne02;
+    const int64_t i02 = batch_idx % ne02;
+
+    const float * A_batch = (const float *) ((const char *) A + i02 * nb02 + i03 * nb03);
+    const float * B_batch = (const float *) ((const char *) B + i02 * nb12 + i03 * nb13);
+    float *       X_batch = (float *)       ((char *)       X + i02 * nb2  + i03 * nb3);
+    // Stage the n x n matrix into sA; `offset` is this work-item's linear index inside the work-group.
+    const int offset = item.get_local_id(1) + item.get_local_id(0) * item.get_local_range(1);
+
+#pragma unroll
+    for (int i = 0; i < n * n; i += k * WARP_SIZE) {
+        const int i0 = i + offset;
+        if (i0 < n * n) {
+            sA[i0] = A_batch[i0];
+        }
+    }
+
+    item.barrier(sycl::access::fence_space::local_space);  // sA must be completely written before it is read
+    // Each lane keeps up to two unknowns of its column: row `lane` (x_low) and row `WARP_SIZE + lane` (x_high).
+    float x_low  = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
+    float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;
+
+    const int half      = WARP_SIZE;
+    const int nrows_low = (n < half) ? n : half;
+    // Rows [0, nrows_low): only x_low values of lanes below the current row contribute to the dot product.
+#pragma unroll
+    for (int row = 0; row < nrows_low; ++row) {
+        float sum = 0.0f;
+        if (lane < row) {
+            sum += sA[row * n + lane] * x_low;
+        }
+        sum = warp_reduce_sum<WARP_SIZE>(sum);
+        if (lane == row) {
+            x_low = (x_low - sum) / sA[row * n + row];
+        }
+    }
+    // Rows [WARP_SIZE, n): every x_low takes part; x_high only for columns below the current row.
+#pragma unroll
+    for (int row = half; row < n; ++row) {
+        float     sum = sA[row * n + lane] * x_low;
+        const int j   = half + lane;
+        if (j < row) {
+            sum += sA[row * n + j] * x_high;
+        }
+        sum = warp_reduce_sum<WARP_SIZE>(sum);
+        if (lane == row - half) {
+            x_high = (x_high - sum) / sA[row * n + row];
+        }
+    }
+    // Write both halves back; X is indexed as [row * k + column], the same way B was read.
+#pragma unroll
+    for (int rr = 0; rr < 2; ++rr) {
+        const int row = rr * WARP_SIZE + lane;
+        if (row < n) {
+            const float val            = (row < half) ? x_low : x_high;
+            X_batch[row * k + col_idx] = val;
+        }
+    }
+}
+
+static void solve_tri_f32_mkl(dpct::queue_ptr stream,
+                               const float * A, float * X,
+                               int n, int k,
+                               int64_t ne02, [[maybe_unused]] int64_t ne03,
+                               int64_t nb02, [[maybe_unused]] int64_t nb03,
+                               int64_t nb2,  [[maybe_unused]] int64_t nb3) {
+    // Hands all ne02 * ne03 batches to one strided trsm_batch call that uses X as its right-hand-side operand.
+    const int64_t n_batches = ne02 * ne03;
+    if (n_batches != 0) {
+        // nb02 / nb2 are byte strides between consecutive batches; trsm_batch takes strides in elements.
+        const int64_t a_batch_stride = nb02 / sizeof(float);
+        const int64_t x_batch_stride = nb2 / sizeof(float);
+        const float   unit_scale     = 1.0f;
+
+        oneapi::mkl::blas::trsm_batch(
+            *stream,
+            oneapi::mkl::side::right,
+            oneapi::mkl::uplo::upper,
+            oneapi::mkl::transpose::nontrans,
+            oneapi::mkl::diag::nonunit,
+            k, n, unit_scale,
+            A, n, a_batch_stride,
+            X, k, x_batch_stride,
+            n_batches);
+    }
+}
+
+inline void ggml_sycl_op_solve_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    // Triangular solve per batch: dst receives X with src0 * X = src1. Small systems use the sub-group kernel, others MKL.
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);  // both are reinterpreted as float below
+
+    dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const int n    = src0->ne[0];
+    const int k    = src1->ne[0];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    GGML_ASSERT(n <= SYCL_SOLVE_TRI_MAX_N && k <= SYCL_SOLVE_TRI_MAX_K);
+
+    const float * A_d = static_cast<const float *>(src0->data);
+    const float * B_d = static_cast<const float *>(src1->data);
+    float * X_d       = static_cast<float *>(dst->data);
+
+    const int64_t nb02 = src0->nb[2];
+    const int64_t nb03 = src0->nb[3];
+    const int64_t nb12 = src1->nb[2];
+    const int64_t nb13 = src1->nb[3];
+    const int64_t nb2  = dst->nb[2];
+    const int64_t nb3  = dst->nb[3];
+
+    const int64_t total_batches = ne02 * ne03;
+    if (n <= 2 * WARP_SIZE && k <= 32) {  // this kernel reads B_d and writes every element of X_d: no pre-copy needed
+        const int smem_size = 2 * WARP_SIZE * 2 * WARP_SIZE;
+        const sycl::range<2> grid(1, total_batches);
+        const sycl::range<2> block(k, WARP_SIZE);
+        stream->submit([&](sycl::handler & cgh) {
+            sycl::local_accessor<float, 1> smem_acc(sycl::range<1>(smem_size), cgh);
+            cgh.parallel_for(
+                sycl::nd_range<2>(grid * block, block),
+                [=](sycl::nd_item<2> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    solve_tri_f32_fast<0, 0>(A_d, B_d, X_d, ne02, ne03,
+                                              nb02, nb03, nb12, nb13, nb2, nb3,
+                                              n, k, item, get_pointer(smem_acc));
+                });
+        });
+    } else {
+        // The MKL path solves in place on X, so X must first hold a copy of B (unless dst already aliases src1).
+        if (X_d != B_d) {
+            stream->memcpy(X_d, B_d, (int64_t) n * k * total_batches * sizeof(float));
+        }
+        solve_tri_f32_mkl(stream, A_d, X_d, n, k, ne02, ne03, nb02, nb03, nb2, nb3);
+    }
+}
+
+void ggml_sycl_solve_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print op_trace(__func__, dst, /*num_src=*/2);  // scoped debug print, alive for the call below
+    ggml_sycl_op_solve_tri(ctx, dst);
+}
--- a/ggml/src/ggml-sycl/solve_tri.hpp
+++ b/ggml/src/ggml-sycl/solve_tri.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "common.hpp"
+
+#define SYCL_SOLVE_TRI_MAX_N 64
+#define SYCL_SOLVE_TRI_MAX_K 64
+
+void ggml_sycl_solve_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-sycl/ssm_conv.cpp
+++ b/ggml/src/ggml-sycl/ssm_conv.cpp
@@ -63,7 +63,7 @@ static void kernel_ssm_conv(
    });
 }

-void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+inline void ggml_sycl_op_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];

@@ -125,3 +125,8 @@ void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        throw;
    }
 }
+
+void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print op_trace(__func__, dst, /*num_src=*/2);  // scoped debug print, alive for the call below
+    ggml_sycl_op_ssm_conv(ctx, dst);
+}
--- a/ggml/src/ggml-sycl/ssm_scan.cpp
+++ b/ggml/src/ggml-sycl/ssm_scan.cpp
@@ -0,0 +1,156 @@
+#include "ssm_scan.hpp"
+#include "common.hpp"
+
+template <int c_factor, int d_state>  // c_factor: state values per lane and sub-groups per work-group (d_state / WARP_SIZE)
+static void ssm_scan_f32_group(
+        const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
+        const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
+        const int32_t * __restrict__ src6, float * __restrict__ dst,
+        const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3,
+        const int src2_nb1, const int src2_nb2, const int src3_nb1,
+        const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
+        const int64_t s_off, const int64_t n_head, const int64_t d_head, const int64_t n_group, const int64_t n_tok,
+        const sycl::nd_item<2> & item) {
+    // SSM scan over n_tok tokens: every sub-group of WARP_SIZE lanes owns the d_state values of one (head, element) pair.
+    const int lane     = item.get_local_id(1) % WARP_SIZE;
+    const int warp     = item.get_local_id(1) / WARP_SIZE;
+    const int warp_idx = item.get_group(1) * c_factor + warp;
+    const int seq_idx  = item.get_group(0);
+    // warp_idx enumerates (head, element-in-head) pairs; head_off and group_off are byte offsets.
+    const int head_idx = warp_idx / d_head;
+    const int head_off = (warp_idx % d_head) * sizeof(float);
+    const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);
+    // All src*_nb* values are byte strides (added to char pointers); src6[seq_idx] selects the src0 slice to start from.
+    const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
+    const float * x_warp  = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float)));
+    const float * dt_warp = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
+    const float * A_warp  = (const float *) ((const char *) src3 + head_idx * src3_nb1);
+    const float * B_warp  = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
+    const float * C_warp  = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
+    float *       y_warp  = dst + (seq_idx * n_tok * n_head * d_head) + warp_idx;
+    float *       s_warp  = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
+    // Per-token strides, converted from bytes to float elements.
+    const int stride_x  = src1_nb2 / sizeof(float);
+    const int stride_dt = src2_nb1 / sizeof(float);
+    const int stride_B  = src4_nb2 / sizeof(float);
+    const int stride_C  = src5_nb2 / sizeof(float);
+    const int stride_y  = n_head * d_head;
+
+    float state[c_factor];
+    float state_sum = 0.0f;
+    // Load this lane's c_factor values of the initial state (lane-interleaved with stride WARP_SIZE).
+#pragma unroll
+    for (int j = 0; j < c_factor; j++) {
+        state[j] = s0_warp[WARP_SIZE * j + lane];
+    }
+
+    for (int64_t i = 0; i < n_tok; i++) {
+        const float dt_val = dt_warp[i * stride_dt];
+        const float dt_soft_plus = (dt_val <= 20.0f ? sycl::log1p(sycl::exp(dt_val)) : dt_val);  // softplus, identity above 20
+        // state <- state * exp(dt * A) + B * (x * dt); the token output is the sum of state * C over the state values.
+        state_sum = 0.0f;
+        const float dA   = sycl::exp(dt_soft_plus * A_warp[0]);
+        const float x_dt = x_warp[i * stride_x] * dt_soft_plus;
+#pragma unroll
+        for (int j = 0; j < c_factor; j++) {
+            const float B_val = B_warp[i * stride_B + WARP_SIZE * j + lane];
+            const float C_val = C_warp[i * stride_C + WARP_SIZE * j + lane];
+            state[j] = (state[j] * dA) + (B_val * x_dt);
+            state_sum += state[j] * C_val;
+        }
+
+        state_sum = warp_reduce_sum<WARP_SIZE>(state_sum);
+        // Only lane 0 stores the reduced value for this token.
+        if (lane == 0) {
+            y_warp[i * stride_y] = state_sum;
+        }
+    }
+    // Store the final state at byte offset s_off into dst, using the same addressing as the initial state.
+#pragma unroll
+    for (int j = 0; j < c_factor; j++) {
+        s_warp[WARP_SIZE * j + lane] = state[j];
+    }
+}
+
+static void ssm_scan_f32_sycl(
+        const float * src0, const float * src1, const float * src2, const float * src3,
+        const float * src4, const float * src5, const int32_t * src6, float * dst,
+        const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3, const int src2_nb1,
+        const int src2_nb2, const int src3_nb1, const int src4_nb2, const int src4_nb3, const int src5_nb2,
+        const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
+        const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
+        dpct::queue_ptr stream) {
+    // Launch shape: dim 0 indexes the sequence, dim 1 supplies one sub-group per (head, head_dim) element.
+    // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
+    GGML_ASSERT(src3_nb1 == sizeof(float));
+    switch (d_state) {
+        case 128: {
+            constexpr int sg_per_wg = 128 / WARP_SIZE;  // sub-groups in one 128-item work-group
+            const sycl::range<2> n_groups(n_seq, (n_head * head_dim + sg_per_wg - 1) / sg_per_wg);
+            const sycl::range<2> wg_shape(1, 128);
+            stream->parallel_for(
+                sycl::nd_range<2>(n_groups * wg_shape, wg_shape),
+                [=](sycl::nd_item<2> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    ssm_scan_f32_group<128 / WARP_SIZE, 128>(
+                        src0, src1, src2, src3, src4, src5, src6, dst,
+                        src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
+                        src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok, item);
+                });
+        } break;
+        case 256: {
+            constexpr int sg_per_wg = 256 / WARP_SIZE;  // sub-groups in one 256-item work-group
+            const sycl::range<2> n_groups(n_seq, (n_head * head_dim + sg_per_wg - 1) / sg_per_wg);
+            const sycl::range<2> wg_shape(1, 256);
+            stream->parallel_for(
+                sycl::nd_range<2>(n_groups * wg_shape, wg_shape),
+                [=](sycl::nd_item<2> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    ssm_scan_f32_group<256 / WARP_SIZE, 256>(
+                        src0, src1, src2, src3, src4, src5, src6, dst,
+                        src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
+                        src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok, item);
+                });
+        } break;
+        default: GGML_ABORT("ssm_scan: unsupported d_state (must be 128 or 256)");
+    }
+}
+
+inline void ggml_sycl_op_ssm_scan(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+    const ggml_tensor * src3 = dst->src[3];
+    const ggml_tensor * src4 = dst->src[4];
+    const ggml_tensor * src5 = dst->src[5];
+    const ggml_tensor * src6 = dst->src[6];
+    // Validates the operands, derives the problem sizes from the tensor shapes and launches ssm_scan_f32_sycl.
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src6->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    // NOTE(review): src1..src5 are handed over as float pointers below without a type check here.
+    const int64_t nc  = src0->ne[0];  // forwarded as d_state
+    const int64_t nr  = src0->ne[1];  // forwarded as head_dim
+    const int64_t nh  = src1->ne[1];  // forwarded as n_head
+    const int64_t ng  = src4->ne[1];  // forwarded as n_group
+    const int64_t n_t = src1->ne[2];  // forwarded as n_tok
+    const int64_t n_s = src1->ne[3];  // forwarded as n_seq
+    const int64_t s_off = ggml_nelements(src1) * sizeof(float);  // byte offset into dst where the state output starts
+    // dst holds one float per src1 element plus nc * nr * nh * n_s state floats.
+    GGML_ASSERT(ggml_nelements(src1) + nc * nr * nh * n_s == ggml_nelements(dst));
+
+    dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    ssm_scan_f32_sycl(
+        static_cast<const float *>(src0->data), static_cast<const float *>(src1->data),
+        static_cast<const float *>(src2->data), static_cast<const float *>(src3->data),
+        static_cast<const float *>(src4->data), static_cast<const float *>(src5->data),
+        static_cast<const int32_t *>(src6->data), static_cast<float *>(dst->data),
+        src0->nb[2], src0->nb[3], src1->nb[2], src1->nb[3], src2->nb[1], src2->nb[2],
+        src3->nb[1], src4->nb[2], src4->nb[3], src5->nb[2], src5->nb[3],
+        s_off, nc, nr, nh, ng, n_t, n_s, stream);
+}
+
+void ggml_sycl_ssm_scan(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print op_trace(__func__, dst, /*num_src=*/7);  // scoped debug print, alive for the call below
+    ggml_sycl_op_ssm_scan(ctx, dst);
+}
--- a/ggml/src/ggml-sycl/ssm_scan.hpp
+++ b/ggml/src/ggml-sycl/ssm_scan.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "common.hpp"
+
+void ggml_sycl_ssm_scan(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -357,38 +357,31 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
    using q8_0_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q8_0>;
    using q8_0_traits = typename q8_0_block::traits;

-    __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int * v, const int * u, const float & d8_0, const sycl::half2 & ds8) {
-        int sumi = 0;
-
-#pragma unroll
-        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
-            // Q8_0 values are signed int8, no nibble extraction needed
-            // Direct dp4a: each int packs 4 int8 values
-            sumi = dpct::dp4a(v[i], u[i], sumi);
-        }
-
-        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
-
-        // Q8_0 has no bias term (values are signed), so just scale
-        return d8_0 * sumi * ds8f.x();
-    }
-
    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
                                     const sycl::half2 * q8_1_ds, const int & iqs) {
-        const int8_t * bq8_0 = static_cast<const int8_t *>(vbq) + ibx_offset.first;
-        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
-        int             v[q8_0_traits::vdr_mmvq];
-        int             u[q8_0_traits::vdr_mmvq];
+        const uint8_t * base = static_cast<const uint8_t *>(vbq);
+        const int8_t *  qs   = reinterpret_cast<const int8_t *>(base + ibx_offset.first);
+        const ggml_half  d   = *reinterpret_cast<const ggml_half *>(base + d_offset.first);
+
+        int v[q8_0_traits::vdr_mmvq];
+        int u[q8_0_traits::vdr_mmvq];

 #pragma unroll
        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
-            v[i] = get_int_from_int8(bq8_0, iqs + i);
+            v[i] = get_int_from_int8(qs, iqs + i);
            u[i] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
        }

-        return vec_dot_q8_0_q8_1_impl(v, u, d, *q8_1_ds);
-    };
+        int sumi = 0;
+#pragma unroll
+        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
+            sumi = dpct::dp4a(v[i], u[i], sumi);
+        }
+
+        const sycl::half2 ds_values = *q8_1_ds;
+        return static_cast<float>(d) * static_cast<float>(ds_values[0]) * sumi;
+    }
 };

 static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
@@ -481,6 +474,65 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
    }
 };

+template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K> {
+    static constexpr ggml_type gtype = GGML_TYPE_Q5_K;
+
+    using q5_k_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q5_K>;
+    using q5_k_traits = typename q5_k_block::traits;
+    // Unpacks Q5_K quants/scales found at byte offsets from vbq plus the matching q8_1 data, then returns vec_dot_q5_K_q8_1_impl_vmmq(...).
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
+                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
+                                     const sycl::half2 * q8_1_ds, const int & iqs) {
+        const uint8_t *    base           = static_cast<const uint8_t *>(vbq);  // all offsets below are in bytes from vbq
+        const uint8_t *    qs             = base + ibx_offset.first;   // low 4 bits
+        const uint8_t *    qh_base        = base + ibx_offset.second;  // high bit
+        const uint8_t *    scs            = base + d_offset.first;  // packed scale bytes, read as uint16_t words below
+        const ggml_half2 * dms            = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);  // forwarded as *dms; presumably block d/dmin - confirm
+
+        const int        bq8_offset = QR5_K * ((iqs / 2) / (QI8_1 / 2));  // index of the first of the QR5_K q8_1 blocks read in the loop below
+        const int *      ql_ptr     = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
+        const int *      qh_ptr     = (const int *) (qh_base + 4 * ((iqs / 2) % 4));
+        const uint16_t * scales     = (const uint16_t *) scs;
+
+        int   vl[2];
+        int   vh[2];
+        int   u[2 * QR5_K];
+        float d8[QR5_K];
+
+        vl[0] = ql_ptr[0];
+        vl[1] = ql_ptr[4];  // second word sits 4 ints (16 bytes) after the first
+
+        vh[0] = qh_ptr[0] >> bq8_offset;  // discard the high-bit planes below position bq8_offset
+        vh[1] = qh_ptr[4] >> bq8_offset;
+
+        uint16_t  aux[2];
+        const int j = (QR5_K * ((iqs / 2) / (QI8_1 / 2))) / 2;  // same expression as bq8_offset, halved
+        if (j < 2) {
+            aux[0] = scales[j + 0] & 0x3f3f;  // keep the low 6 bits of each byte
+            aux[1] = scales[j + 2] & 0x3f3f;
+        } else {
+            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);  // low nibbles + top 2 bits of scales[j - 2] moved to bits 4..5
+            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);  // high nibbles + top 2 bits of scales[j] moved to bits 4..5
+        }
+
+        const uint8_t * sc = (const uint8_t *) aux;  // the two bytes of aux[0]
+        const uint8_t * m  = sc + 2;  // the two bytes of aux[1]; presumably the mins - confirm against the impl
+
+        for (int i = 0; i < QR5_K; ++i) {
+            const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;  // quants of q8_1 block (bq8_offset + i)
+            sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
+
+            d8[i]                   = ds_values[0];  // first half2 component, widened to float
+
+            const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);  // the cast binds first: the offset is counted in ints
+            u[2 * i + 0]   = q8[0];
+            u[2 * i + 1]   = q8[4];  // same 4-int stride as vl[]
+        }
+
+        return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, *dms, d8);
+    }
+};
+
 template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
    static constexpr ggml_type gtype = GGML_TYPE_Q6_K;

--- a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp
@@ -1,5 +1,7 @@
 #include "ggml-remoting.h"

+#include <mutex>
+
 static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
    virtgpu * gpu = DEV_TO_GPU(dev);

--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2149,11 +2149,11 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin

    // Patch SPIR-V to enable RTE rounding for FP16, avoiding the need for
    // separate shader variants compiled with -DRTE16.
-    std::vector<uint32_t> spv;
+    std::vector<uint32_t> spirv;
    if (device->float_controls_rte_fp16) {
        const uint32_t* spv_words = reinterpret_cast<const uint32_t *>(spv_data);
        size_t word_count = spv_size / sizeof(uint32_t);
-        spv.assign(spv_words, spv_words + word_count);
+        spirv.assign(spv_words, spv_words + word_count);

        // Find insertion points respecting SPIR-V layout order:
        //   Header(5) -> OpCapability -> OpExtension -> ... -> OpEntryPoint -> OpExecutionMode -> ...
@@ -2163,9 +2163,9 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        size_t exec_insert_pos = pos;
        uint32_t entry_point_id = 0;

-        while (pos < spv.size()) {
-            uint32_t opcode = spv[pos] & spv::OpCodeMask;
-            uint32_t len    = spv[pos] >> spv::WordCountShift;
+        while (pos < spirv.size()) {
+            uint32_t opcode = spirv[pos] & spv::OpCodeMask;
+            uint32_t len    = spirv[pos] >> spv::WordCountShift;
            if (len == 0) break;

            if (opcode == spv::OpCapability) {
@@ -2174,7 +2174,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
            } else if (opcode == spv::OpExtension) {
                ext_insert_pos = pos + len;
            } else if (opcode == spv::OpEntryPoint) {
-                entry_point_id = spv[pos + 2];
+                entry_point_id = spirv[pos + 2];
                exec_insert_pos = pos + len;
            } else if (opcode == spv::OpExecutionMode || opcode == spv::OpExecutionModeId) {
                exec_insert_pos = pos + len;
@@ -2189,7 +2189,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin

        // OpExecutionMode %entrypoint RoundingModeRTE 16
        uint32_t exec_mode[] = { (4u << spv::WordCountShift) | spv::OpExecutionMode, entry_point_id, spv::ExecutionModeRoundingModeRTE, 16 };
-        spv.insert(spv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode));
+        spirv.insert(spirv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode));

        // OpExtension "SPV_KHR_float_controls"
        const char ext_str[] = "SPV_KHR_float_controls";
@@ -2197,13 +2197,13 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        std::vector<uint32_t> extension(1 + ext_str_words, 0);
        extension[0] = (uint32_t)((1 + ext_str_words) << spv::WordCountShift) | spv::OpExtension;
        memcpy(&extension[1], ext_str, sizeof(ext_str));
-        spv.insert(spv.begin() + ext_insert_pos, extension.begin(), extension.end());
+        spirv.insert(spirv.begin() + ext_insert_pos, extension.begin(), extension.end());

        // OpCapability RoundingModeRTE
        uint32_t capability[] = { (2u << spv::WordCountShift) | spv::OpCapability, spv::CapabilityRoundingModeRTE };
-        spv.insert(spv.begin() + cap_insert_pos, std::begin(capability), std::end(capability));
+        spirv.insert(spirv.begin() + cap_insert_pos, std::begin(capability), std::end(capability));

-        shader_module_create_info = vk::ShaderModuleCreateInfo({}, spv.size() * sizeof(uint32_t), spv.data());
+        shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
    }

    pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -175,6 +175,7 @@ class Keys:
        SLIDING_WINDOW               = "{arch}.attention.sliding_window"
        SCALE                        = "{arch}.attention.scale"
        OUTPUT_SCALE                 = "{arch}.attention.output_scale"
+        VALUE_SCALE                  = "{arch}.attention.value_scale"
        TEMPERATURE_LENGTH           = "{arch}.attention.temperature_length"
        KEY_LENGTH_MLA               = "{arch}.attention.key_length_mla"
        VALUE_LENGTH_MLA             = "{arch}.attention.value_length_mla"
@@ -773,6 +774,14 @@ class MODEL_TENSOR(IntEnum):
    V_DS_NORM            = auto() # qwen3vl
    V_DS_FC1             = auto() # qwen3vl
    V_DS_FC2             = auto() # qwen3vl
+    V_MERGER_LN1         = auto() # minicpmv4_6
+    V_MERGER_ATTN_Q      = auto() # minicpmv4_6
+    V_MERGER_ATTN_K      = auto() # minicpmv4_6
+    V_MERGER_ATTN_V      = auto() # minicpmv4_6
+    V_MERGER_ATTN_O      = auto() # minicpmv4_6
+    V_MERGER_DS_LN       = auto() # minicpmv4_6
+    V_MERGER_DS_UP       = auto() # minicpmv4_6
+    V_MERGER_DS_DOWN     = auto() # minicpmv4_6
    V_MM_POST_FC_NORM    = auto() # cogvlm
    V_MM_UP              = auto() # cogvlm
    V_MM_DOWN            = auto() # cogvlm
@@ -1277,6 +1286,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_DS_NORM:                 "v.deepstack.{bid}.norm",
    MODEL_TENSOR.V_DS_FC1:                  "v.deepstack.{bid}.fc1",
    MODEL_TENSOR.V_DS_FC2:                  "v.deepstack.{bid}.fc2",
+    MODEL_TENSOR.V_MERGER_LN1:              "v.vit_merger.ln1",
+    MODEL_TENSOR.V_MERGER_ATTN_Q:           "v.vit_merger.attn_q",
+    MODEL_TENSOR.V_MERGER_ATTN_K:           "v.vit_merger.attn_k",
+    MODEL_TENSOR.V_MERGER_ATTN_V:           "v.vit_merger.attn_v",
+    MODEL_TENSOR.V_MERGER_ATTN_O:           "v.vit_merger.attn_out",
+    MODEL_TENSOR.V_MERGER_DS_LN:            "v.vit_merger.ds_ln",
+    MODEL_TENSOR.V_MERGER_DS_UP:            "v.vit_merger.ds_ffn_up",
+    MODEL_TENSOR.V_MERGER_DS_DOWN:          "v.vit_merger.ds_ffn_down",
    MODEL_TENSOR.V_MM_POST_FC_NORM:         "mm.post_fc_norm", # cogvlm
    MODEL_TENSOR.V_MM_UP:                   "mm.up",
    MODEL_TENSOR.V_MM_DOWN:                 "mm.down",
@@ -1449,6 +1466,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_DS_NORM,
        MODEL_TENSOR.V_DS_FC1,
        MODEL_TENSOR.V_DS_FC2,
+        MODEL_TENSOR.V_MERGER_LN1,
+        MODEL_TENSOR.V_MERGER_ATTN_Q,
+        MODEL_TENSOR.V_MERGER_ATTN_K,
+        MODEL_TENSOR.V_MERGER_ATTN_V,
+        MODEL_TENSOR.V_MERGER_ATTN_O,
+        MODEL_TENSOR.V_MERGER_DS_LN,
+        MODEL_TENSOR.V_MERGER_DS_UP,
+        MODEL_TENSOR.V_MERGER_DS_DOWN,
        MODEL_TENSOR.V_MM_POST_FC_NORM,
        MODEL_TENSOR.V_MM_UP,
        MODEL_TENSOR.V_MM_DOWN,
@@ -2418,6 +2443,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_POST_NORM,
@@ -3844,6 +3871,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
@@ -3858,6 +3886,10 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
    ],
    MODEL_ARCH.STEP35: [
        MODEL_TENSOR.TOKEN_EMBD,
@@ -4224,6 +4256,7 @@ class VisionProjectorType:
    NEMOTRON_V2_VL = "nemotron_v2_vl"
    HUNYUANOCR     = "hunyuanocr"
    HUNYUANVL      = "hunyuanvl"
+    MINICPMV4_6    = "minicpmv4_6"
    GRANITE_SPEECH = "granite_speech"  # audio


--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -943,6 +943,9 @@ class GGUFWriter:
    def add_attn_output_scale(self, value: float) -> None:
        self.add_float32(Keys.Attention.OUTPUT_SCALE.format(arch=self.arch), value)

+    def add_attn_value_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.VALUE_SCALE.format(arch=self.arch), value)
+
    def add_attn_temperature_length(self, value: int) -> None:
        self.add_uint32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value)

--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1399,6 +1399,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
            "vision_tower.vision_model.embeddings.patch_embedding",
+            "model.vision_tower.embeddings.patch_embedding", # minicpmv4_6
            "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
            "vpm.embeddings.patch_embedding",
            "model.vision_model.embeddings.patch_embedding", # SmolVLM
@@ -1424,6 +1425,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_EMBD_POS: (
            "vision_tower.vision_model.embeddings.position_embedding",
+            "model.vision_tower.embeddings.position_embedding", # minicpmv4_6
            "model.vision_tower.embeddings.position_embeddings", # Intern-S1
            "vpm.embeddings.position_embedding",
            "model.vision_model.embeddings.position_embedding", # SmolVLM
@@ -1460,6 +1462,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_ATTN_Q: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+            "model.vision_tower.encoder.layers.{bid}.self_attn.q_proj", # minicpmv4_6
            "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.q_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
@@ -1483,6 +1486,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_ATTN_K: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+            "model.vision_tower.encoder.layers.{bid}.self_attn.k_proj", # minicpmv4_6
            "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.k_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
@@ -1506,6 +1510,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_ATTN_V: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+            "model.vision_tower.encoder.layers.{bid}.self_attn.v_proj", # minicpmv4_6
            "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.v_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
@@ -1522,6 +1527,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_INPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+            "model.vision_tower.encoder.layers.{bid}.layer_norm1", # minicpmv4_6
            "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
            "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
            "vpm.encoder.layers.{bid}.layer_norm1",
@@ -1542,6 +1548,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_ATTN_O: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+            "model.vision_tower.encoder.layers.{bid}.self_attn.out_proj", # minicpmv4_6
            "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
            "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.out_proj",
@@ -1564,6 +1571,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+            "model.vision_tower.encoder.layers.{bid}.layer_norm2", # minicpmv4_6
            "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
            "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
            "vpm.encoder.layers.{bid}.layer_norm2",
@@ -1585,6 +1593,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_FFN_UP: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+            "model.vision_tower.encoder.layers.{bid}.mlp.fc1", # minicpmv4_6
            "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
            "vpm.encoder.layers.{bid}.mlp.fc1",
            "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
@@ -1613,6 +1622,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_FFN_DOWN: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+            "model.vision_tower.encoder.layers.{bid}.mlp.fc2", # minicpmv4_6
            "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
            "vpm.encoder.layers.{bid}.mlp.fc2",
            "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
@@ -1668,6 +1678,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_POST_NORM: (
            "vision_tower.vision_model.post_layernorm",
+            "model.vision_tower.post_layernorm", # minicpmv4_6
            "model.vision_model.post_layernorm", # SmolVLM
            "vision_model.layernorm_post", # llama4
            "visual.merger.ln_q", # qwen2vl
@@ -1696,6 +1707,7 @@ class TensorNameMap:
            "mlp_AR.pre_norm", # PaddleOCR-VL
            "merger.ln_q",
            "vision_tower.merger.ln_q", # dots.ocr
+            "model.merger.mlp.0.pre_norm", # minicpmv4_6
        ),

        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -1769,6 +1781,38 @@ class TensorNameMap:
            "model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl
        ),

+        MODEL_TENSOR.V_MERGER_LN1: (
+            "model.vision_tower.vit_merger.layer_norm1", # minicpmv4_6
+        ),
+
+        MODEL_TENSOR.V_MERGER_ATTN_Q: (
+            "model.vision_tower.vit_merger.self_attn.q_proj", # minicpmv4_6
+        ),
+
+        MODEL_TENSOR.V_MERGER_ATTN_K: (
+            "model.vision_tower.vit_merger.self_attn.k_proj", # minicpmv4_6
+        ),
+
+        MODEL_TENSOR.V_MERGER_ATTN_V: (
+            "model.vision_tower.vit_merger.self_attn.v_proj", # minicpmv4_6
+        ),
+
+        MODEL_TENSOR.V_MERGER_ATTN_O: (
+            "model.vision_tower.vit_merger.self_attn.out_proj", # minicpmv4_6
+        ),
+
+        MODEL_TENSOR.V_MERGER_DS_LN: (
+            "model.vision_tower.vit_merger.pre_norm", # minicpmv4_6
+        ),
+
+        MODEL_TENSOR.V_MERGER_DS_UP: (
+            "model.vision_tower.vit_merger.linear_1", # minicpmv4_6
+        ),
+
+        MODEL_TENSOR.V_MERGER_DS_DOWN: (
+            "model.vision_tower.vit_merger.linear_2", # minicpmv4_6
+        ),
+
        MODEL_TENSOR.V_SAM_POS_EMBD: (
            "model.sam_model.pos_embed",
        ),
@@ -1828,11 +1872,13 @@ class TensorNameMap:
        MODEL_TENSOR.V_MM_UP: (
            "model.vision.linear_proj.dense_h_to_4h", # cogvlm
            "visual.merger.up_proj", # glm4v
+            "model.merger.mlp.0.linear_1", # minicpmv4_6
        ),

        MODEL_TENSOR.V_MM_DOWN: (
            "model.vision.linear_proj.dense_4h_to_h", # cogvlm
            "visual.merger.down_proj", # glm4v
+            "model.merger.mlp.0.linear_2", # minicpmv4_6
        ),

        MODEL_TENSOR.V_MM_GATE: (
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,8 +1,8 @@
 [project]
 name = "gguf"
+version = "0.19.0"
 description = "Read and write ML models in GGUF for GGML"
 keywords = ["ggml", "gguf", "llama.cpp"]
-version = "0.18.0"
 dynamic = ["classifiers"]
 readme = "README.md"
 authors = [{name = "GGML", email = "ggml@ggml.ai"}]
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-ac6f7b44f60fde0091f0b3d99afde48f8c99b13a
+628249b398293fc8d2fa81a449ae2920a02c6523
--- a/scripts/sync_vendor.py
+++ b/scripts/sync_vendor.py
@@ -5,7 +5,7 @@ import os
 import sys
 import subprocess

-HTTPLIB_VERSION = "refs/tags/v0.43.3"
+HTTPLIB_VERSION = "refs/tags/v0.43.4"

 vendor = {
    "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -232,6 +232,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,       "%s.attention.sliding_window_pattern"       },
    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
    { LLM_KV_ATTENTION_OUTPUT_SCALE,                 "%s.attention.output_scale"                 },
+    { LLM_KV_ATTENTION_VALUE_SCALE,                  "%s.attention.value_scale"                  },
    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,           "%s.attention.temperature_length"           },
    { LLM_KV_ATTENTION_TEMPERATURE_SCALE,            "%s.attention.temperature_scale"            },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -236,6 +236,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
+    LLM_KV_ATTENTION_VALUE_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2451,7 +2451,30 @@ public:
        for (auto & [buft, mbuf] : mbufs_new) {
            auto & mbuf_cur = mbufs[buft];

-            if (!mbuf_cur.buf || mbuf_cur.org.size() != mbuf.org.size() || mbuf_cur.total_size != mbuf.total_size) {
+            bool need_alloc = false;
+
+            need_alloc = need_alloc || (!mbuf_cur.buf);
+            need_alloc = need_alloc || (mbuf_cur.org.size() != mbuf.org.size());
+            need_alloc = need_alloc || (mbuf_cur.total_size != mbuf.total_size);
+
+            if (!need_alloc) {
+                for (size_t i = 0; i < mbuf_cur.org.size(); ++i) {
+                    auto * org0 = mbuf_cur.org[i];
+                    auto * org1 = mbuf.org[i];
+
+                    if (!ggml_are_same_shape(org0, org1)) {
+                        need_alloc = true;
+                        break;
+                    }
+
+                    if (org0->view_src != org1->view_src || org0->view_offs != org1->view_offs) {
+                        need_alloc = true;
+                        break;
+                    }
+                }
+            }
+
+            if (need_alloc) {
                mbuf_cur = std::move(mbuf);

                mbuf_cur.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(mbuf_cur.ctx.get(), buft));
@@ -2515,6 +2538,31 @@ public:
            mbufs_new[buft].total_size += rinfo.size;
        }

+        for (auto & [buft, mbuf] : mbufs_new) {
+            ggml_init_params params = {
+                /*.mem_size   =*/ mbuf.n_tensors*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+
+            mbuf.ctx.reset(ggml_init(params));
+
+            mbuf.org.reserve(mbuf.n_tensors);
+        }
+
+        for (const auto & rinfo : rinfos) {
+            auto * buft = ggml_backend_buffer_get_type(rinfo.tensor->buffer);
+
+            const int64_t n = rinfo.size/ggml_element_size(rinfo.tensor);
+
+            auto & mbuf = mbufs_new[buft];
+
+            mbuf.org.push_back(ggml_view_1d(mbuf.ctx.get(), rinfo.tensor, n, rinfo.offset));
+
+            auto & view = mbuf.org.back();
+            view->buffer = rinfo.tensor->buffer;
+        }
+
        for (auto & [buft, mbuf] : mbufs_new) {
            const auto & mbuf_cur = mbufs.at(buft);

@@ -2523,9 +2571,11 @@ public:
            }

            for (size_t i = 0; i < mbuf_cur.org.size(); ++i) {
-                ggml_backend_tensor_copy(mbuf_cur.cpy[i], mbuf_cur.org[i]);
+                ggml_backend_tensor_copy(mbuf_cur.cpy[i], mbuf.org[i]);
            }
        }
+
+        GGML_ASSERT(buf_size == 0);
    }

    void read(void * dst, size_t size) override {
@@ -2656,13 +2706,8 @@ size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * sr
            throw std::runtime_error("wrong sequence state magic");
        }

-        const bool need_seq_match = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-
        llama_seq_id seq_id_read;
        io->read(&seq_id_read, sizeof(seq_id_read));
-        if (need_seq_match && seq_id != seq_id_read) {
-            throw std::runtime_error("wrong sequence id");
-        }

        return state_seq_read_data(*io, seq_id, flags);
    } catch (const std::exception & err) {
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -166,6 +166,8 @@ struct llama_hparams {
    float    f_attn_out_scale = 0.0f;
    uint32_t attn_temp_length = 0;

+    float    f_attn_value_scale = 0.0f;
+
    bool causal_attn   = true;
    bool use_alibi     = false;
    bool attn_soft_cap = false;
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -726,6 +726,10 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
        cell_ranges.emplace_back(cell_range_begin, size);
    }

+    if ((flags & LLAMA_STATE_SEQ_FLAGS_ON_DEVICE) && cell_ranges.size() > 1) { // flags is a bitmask: test the bit with &, not %
+        GGML_ABORT("cannot save/load multiple ranges of cells to/from device memory\n");
+    }
+
    // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
    uint32_t cell_count_check = 0;
    for (const auto & range : cell_ranges) {
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -268,6 +268,7 @@ void llama_model_saver::add_kv_from_model() {
    // add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,  ???);
    add_kv(LLM_KV_ATTENTION_SCALE,                   hparams.f_attention_scale);
    add_kv(LLM_KV_ATTENTION_OUTPUT_SCALE,            hparams.f_attn_out_scale);
+    add_kv(LLM_KV_ATTENTION_VALUE_SCALE,             hparams.f_attn_value_scale);
    add_kv(LLM_KV_ATTENTION_TEMPERATURE_LENGTH,      hparams.attn_temp_length);
    add_kv(LLM_KV_ATTENTION_TEMPERATURE_SCALE,       hparams.f_attn_temp_scale);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA,          hparams.n_embd_head_k_mla_impl);
--- a/Show More
+++ b/Show More