server : print warning when HTTP timeout exceeded (#22907 )

backend sampling: support returning post-sampling probs (#22622 )
* server: Never return 0.0 post-sampling probabilities * backend sampling: support returning post-sampling probs
2026-05-14 17:07:43 +03:00 · 2026-05-10 22:00:18 +03:00 · 2026-05-10 19:12:02 +02:00 · 2026-05-10 18:46:54 +02:00 · 2026-05-10 17:32:41 +02:00 · 2026-05-10 17:00:11 +03:00
76 changed files with 3544 additions and 310 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
+ARG IGC_VERSION=v2.32.7
+ARG IGC_VERSION_FULL=2_2.32.7+21184
+ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
 ARG IGDGMM_VERSION=22.9.0
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -103,6 +103,7 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
+    spirv-headers
  ];
 in

@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      ninja
      pkg-config
      git
-      spirv-headers
    ]
    ++ optionals useCuda [
      cudaPackages.cuda_nvcc
--- a/.gitignore
+++ b/.gitignore
@@ -110,6 +110,7 @@ uv.lock

 # Nix

+flake.lock
 /result

 # Test binaries
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -369,9 +369,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.tool_arg_string_value(p.schema(until_suffix,
-                                                                 "tool-" + name + "-arg-" + param_name + "-schema",
-                                                                 param_schema, true)) :
+                                p.tool_arg_string_value(until_suffix) :
                                p.tool_arg_json_value(p.schema(
                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
                                    p.space()) +
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -158,8 +158,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
    for (size_t i = 0; i < cur_p->size; i++) {
        if (cur_p->data[i].id != forced) {
            cur_p->data[i].logit = -INFINITY;
-        } else {
-            cur_p->data[i].logit = +INFINITY; // force the token
        }
    }
 }
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

+    gsmpl->set_logits(ctx, idx);
+
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
@@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
            GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
            GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");

-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+            for (size_t i = 0; i < cur_p.size; ++i) {
+                if (cur_p.data[i].id == id) {
+                    cur_p.selected = i;
+                    break;
+                }
+            }

            return id;
        }
    }

-    gsmpl->set_logits(ctx, idx);
-
    // apply reasoning budget first
    llama_sampler_apply(rbudget, &cur_p);

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
        if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
            # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
            res = "f2llmv2"
+        if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
+            # ref: https://huggingface.co/sarvamai/sarvam-30b
+            res = "sarvam-moe"

        if res is None:
            logger.warning("\n")
@@ -7988,13 +7991,37 @@ class Gemma4Model(Gemma3Model):
        rope_freqs_full = torch.tensor(values, dtype=torch.float32)
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)

+    def _generate_nvfp4_tensors(self):
+        # Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales
+        # each expert's contribution. It's mathematically equivalent to a per-expert
+        # scalar on the down_proj output, which is exactly where ffn_down_exps_s is
+        # applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the
+        # existing NVFP4 path produces the right scales.
+        n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
+        for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]:
+            bid_match = re.search(r"\.layers\.(\d+)\.", name)
+            if bid_match is None:
+                continue
+            bid = bid_match.group(1)
+            prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")]
+            w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)]
+            present = [w2 in self.model_tensors for w2 in w2_targets]
+            if not any(present):
+                continue
+            assert all(present), f"layer {bid}: partial NVFP4 quantization across experts"
+            r = self.model_tensors.pop(name)
+            for e, w2 in enumerate(w2_targets):
+                s = self.model_tensors[w2]
+                self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i]
+        super()._generate_nvfp4_tensors()
+
    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item

        if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
            name = name + ".weight"
-        if ".experts." in name and not name.endswith(".weight"):
+        if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
            name += ".weight"

        return super().filter_tensors((name, gen))
@@ -11567,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
+class SarvamMoEModel(BailingMoeV2Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
+    # Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
+    #  - full rotary (no partial_rotary_factor)
+    #  - expert bias is zero-mean normalized at load time
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+        if name.endswith(".expert_bias"):
+            # Sarvam normalizes expert bias to zero mean
+            inner = gen
+
+            def gen():
+                t = inner()
+                return t - t.mean()
+        return super().filter_tensors((name, gen))
+
+
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
 class GroveMoeModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GROVEMOE
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -155,6 +155,7 @@ models = [
    {"name": "joyai-llm",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
    {"name": "kanana2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
    {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
+    {"name": "sarvam-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -737,6 +737,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|

+## Compile-time Flags
+
+Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.
+
+| Name            | Function                                                                         |
+|-----------------|----------------------------------------------------------------------------------|
+| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
+
 ## Design Rule

 - Open to all contributors.
--- a/flake.lock
+++ b/flake.lock
@@ -1,58 +0,0 @@
-{
-  "nodes": {
-    "flake-parts": {
-      "inputs": {
-        "nixpkgs-lib": "nixpkgs-lib"
-      },
-      "locked": {
-        "lastModified": 1730504689,
-        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
-        "owner": "hercules-ci",
-        "repo": "flake-parts",
-        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
-        "type": "github"
-      },
-      "original": {
-        "owner": "hercules-ci",
-        "repo": "flake-parts",
-        "type": "github"
-      }
-    },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1732014248,
-        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "nixpkgs-lib": {
-      "locked": {
-        "lastModified": 1730504152,
-        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
-        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
-      },
-      "original": {
-        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-parts": "flake-parts",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 11)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
--- a/ggml/src/ggml-cuda/allreduce.cu
+++ b/ggml/src/ggml-cuda/allreduce.cu
@@ -0,0 +1,968 @@
+#include "allreduce.cuh"
+
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+
+#include "convert.cuh"
+#include "ggml-impl.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+
+// ---------------------------------------------------------------------------
+// CUDA AllReduce for tensor-parallel inference across two GPUs.
+//
+// Provides an in-place sum reduction over matching tensors on two CUDA
+// devices in the same process.  Used by the tensor-split path alongside
+// NCCL; targets setups without NVLink, where data is exchanged between the
+// GPUs by staging it through pinned host memory over PCIe.
+//
+// Two reduction strategies are selected per call by tensor size:
+//
+//   * Chunked kernel path (small reductions): a single CUDA kernel both
+//     stages data through pinned host memory and performs the local sum.
+//     Cross-GPU synchronization happens *inside the kernel* (busy-wait on
+//     a host-memory flag), which keeps launch overhead low for the
+//     latency-sensitive token-generation case.
+//
+//   * Copy-engine path (large reductions): the transfer is split into
+//     D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
+//     followed by a small device-side add kernel.  Cross-GPU
+//     synchronization happens *outside the kernel*, via CUDA events
+//     between streams.  This keeps the compute engine free while large
+//     transfers are in flight, which matters for prefill-sized tensors.
+//     Reductions larger than the per-call inner cap are processed by an
+//     outer chunker that issues sequential inner calls.
+// ---------------------------------------------------------------------------
+
+// ---------------------------------------------------------------------------
+// Cross-GPU signal mechanism
+//
+// One int per (slot, rank) pair in pinned host memory.  Each AR call writes a
+// strictly increasing token (= the AR call number) into its own arrival int.
+// The peer spins until its read of the other's arrival int equals the token
+// it expects for this call -- a mismatch means the peer hasn't arrived yet.
+// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
+// days at thousands of ARs/sec), so arrival ints don't need to be reset
+// between calls; we initialize once at pipeline init and let the values
+// accumulate.
+//
+// There is exactly one writer (the owning GPU) and one reader (the peer), so
+// we don't need atomics.  A volatile store paired with __threadfence_system()
+// provides the release ordering that makes the D2H writes visible system-wide
+// before the arrival token is observed.
+//
+// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
+// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
+// portable choice.
+// ---------------------------------------------------------------------------
+
+static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
+    *(volatile int *)p = token;
+}
+static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
+    return *(const volatile int *)p;
+}
+
+// Byte spacing between adjacent arrival ints.  64 bytes (one cache line)
+// ensures each GPU/block's arrival slot lives on its own line, preventing
+// false-sharing stalls on the polling GPU.
+static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;
+
+// Number of blocks the chunked kernel launches with.  Each block stripes a
+// disjoint slice of the data and synchronizes through its own arrival-token
+// slot so multiple SMs can pump PCIe stores in parallel.
+static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;
+
+// ---------------------------------------------------------------------------
+// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
+//
+// Both GPUs run this kernel simultaneously on independent streams.  sendbuf
+// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
+// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
+// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe).  When
+// T_dst == T_wire the casts below are no-ops.
+//
+// Each GPU runs three phases:
+//
+//   Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
+//                          single-instruction-width vectors into host_mine.
+//                          __threadfence_system() commits these writes to host
+//                          memory.
+//   Phase 2 (thread 0):    write token to arrival_mine; spin until
+//                          arrival_other == token.
+//   Phase 3 (all threads): read T_wire vectors from host_other, cast
+//                          each element to T_dst, and sum with the local
+//                          sendbuf value (also rounded through T_wire so that
+//                          both GPUs truncate identically -- this guarantees
+//                          bit-equivalent results across the two devices).
+//
+// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
+// threads to keep multiple SMs issuing PCIe stores in parallel.  Each block
+// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
+// thread 0 of each block signals/spins on that slot independently of other
+// blocks.  Tail elements (the leftover < ELEMS_PER_VEC at the end) are
+// handled only by block 0 to avoid cross-block writes to the same slots.
+// ---------------------------------------------------------------------------
+template <typename T_dst, typename T_wire>
+static __global__ void ggml_cuda_ar_kernel(
+        const T_dst  *              sendbuf,
+        T_dst        *              recvbuf,
+        T_wire       * __restrict__ host_mine,
+        const T_wire * __restrict__ host_other,
+        int                         count,
+        int *                       arrival_mine,
+        int *                       arrival_other,
+        int                         token) {
+
+    // Vector unit for the wire type, sized to the arch's widest single-instruction
+    // copy (16 B on Volta+).  Each phase-1 iter writes one vector to host memory;
+    // each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
+    constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
+    constexpr int ARRIVAL_INTS  = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
+
+    const int tid       = threadIdx.x;
+    const int nt        = blockDim.x;
+    const int bid       = blockIdx.x;
+    const int gtid      = bid * nt + tid;
+    const int gnt       = gridDim.x * nt;
+    const int count_vec = count / ELEMS_PER_VEC;
+    const int tail      = count_vec * ELEMS_PER_VEC;
+
+    // Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
+    {
+        for (int i = gtid; i < count_vec; i += gnt) {
+            const int off = i * ELEMS_PER_VEC;
+            T_wire wire[ELEMS_PER_VEC];
+            #pragma unroll
+            for (int k = 0; k < ELEMS_PER_VEC; ++k) {
+                wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
+            }
+            ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
+        }
+        if (bid == 0 && tid < count - tail) {
+            host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
+        }
+    }
+
+    // Commit this block's host writes before signalling.
+    __threadfence_system();
+    __syncthreads();
+
+    // Phase 2: thread 0 of each block signals on its own arrival slot, then
+    // spins for the matching slot from peer.  Per-block tokens mean blocks
+    // proceed independently -- no inter-block barrier needed.
+    if (tid == 0) {
+        int       * my_slot    = arrival_mine  + bid * ARRIVAL_INTS;
+        const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
+
+        ggml_cuda_ar_signal_set(my_slot, token);
+        __threadfence_system(); // make our signal visible system-wide
+
+        while (ggml_cuda_ar_signal_get(other_slot) != token) {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+            __nanosleep(100);
+#else
+            NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+        }
+    }
+
+    __syncthreads();
+
+    // Acquire peer's host_other writes (this block's stripe of them).
+    __threadfence_system();
+
+    // Phase 3: read peer's T_wire vector, cast both sides through T_wire for
+    // bit-equivalence, sum in T_dst precision, and write back to recvbuf.
+    {
+        for (int i = gtid; i < count_vec; i += gnt) {
+            const int off = i * ELEMS_PER_VEC;
+            T_wire wire[ELEMS_PER_VEC];
+            ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
+            #pragma unroll
+            for (int k = 0; k < ELEMS_PER_VEC; ++k) {
+                const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
+                recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
+            }
+        }
+        if (bid == 0 && tid < count - tail) {
+            const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
+            recvbuf[tail + tid] =
+                ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
+        }
+    }
+}
+
+// Combined load-convert-add kernel.  The peer's contribution arrives as T_src
+// (which may be a lower-precision type than T_dst when the BF16 round-trip is
+// active).  For bit-equivalence between the two GPUs, dst is first rounded
+// through T_src's precision via ggml_cuda_cast -- peer already truncated its
+// own value the same way before sending -- so both sides perform identical
+// arithmetic.  When T_dst == T_src the round-trip cast is a no-op.
+template <typename T_dst, typename T_src>
+static __global__ void ggml_cuda_ar_add_kernel(
+        T_dst       * __restrict__ dst,
+        const T_src * __restrict__ src,
+        int count) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int nt  = gridDim.x * blockDim.x;
+    for (int i = tid; i < count; i += nt) {
+        const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
+        dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Pipeline structure
+// ---------------------------------------------------------------------------
+
+// Number of slots in the event / arrival ring.  Two slots is sufficient:
+// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
+// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
+// from AR N-2 by the time we get to AR N.  acquire_slot's
+// cudaEventSynchronize on ev.ker for both devices makes that consumption
+// explicit before we overwrite host_buf[slot] for the new AR.
+static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;
+
+// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
+// Larger tensors are reduced by issuing multiple chunked launches.
+static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB
+
+// Copy-engine path: largest tensor accepted on this path; sets host_large /
+// dev_tmp allocation size.
+static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB
+
+// AR wire size at which the copy-engine path takes over from the chunked-
+// kernel path.  Override via GGML_CUDA_AR_COPY_THRESHOLD.
+static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
+// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
+// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
+// the peer); the clamps cover the cases where nbytes/4 is too small (per-
+// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
+// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
+static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024;       // 512 KB
+static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024;  // 2 MB
+// Absolute floor that an env-var override is allowed to set; this caps the
+// per-slot copy-event array.  256 KB -> up to 128 chunks per 32 MB tensor.
+static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
+static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
+    static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
+                    GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
+
+struct ggml_cuda_ar_event_slot {
+    cudaEvent_t app = nullptr;  // upstream computation complete
+    cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {};  // copy-engine D2H chunks complete
+    cudaEvent_t h2d = nullptr;  // copy-engine H2Ds complete (handoff AR stream -> compute stream)
+    cudaEvent_t ker = nullptr;  // AllReduce kernel complete
+};
+
+// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
+// in one place, with the host handle preserved for cudaFreeHost.  Used where
+// the CPU never touches the buffer -- only the device reads/writes via the
+// mapped device pointer.  Required on systems where cudaDevAttrCanUseHost-
+// PointerForRegisteredMem is 0 and the host pointer can't be used as a
+// device pointer.
+struct ggml_cuda_ar_host_mapping {
+    uint8_t * host = nullptr;   // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
+    uint8_t * dev  = nullptr;   // device-side pointer for kernels / cudaMemset
+
+    cudaError_t alloc(size_t bytes) {
+        cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
+                                       cudaHostAllocPortable | cudaHostAllocMapped);
+        if (rc != cudaSuccess) {
+            host = nullptr;
+            return rc;
+        }
+        rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
+        if (rc != cudaSuccess) {
+            cudaFreeHost(host);
+            host = nullptr;
+            dev  = nullptr;
+        }
+        return rc;
+    }
+
+    void free() {
+        if (host) {
+            cudaFreeHost(host);
+            host = nullptr;
+            dev  = nullptr;
+        }
+    }
+};
+
+struct ggml_cuda_ar_pipeline {
+    int      n_devices;
+    int      devices[GGML_CUDA_MAX_DEVICES];
+    size_t   buf_bytes;    // bytes per device in host_buf[]
+    size_t   copy_bytes;   // bytes per device in host_large[] / dev_tmp[]
+    size_t   copy_threshold;
+    size_t   copy_chunk_bytes;
+    size_t   bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
+    uint64_t call_count;
+
+    // Per-device resources.
+    ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES];   // pinned staging (chunked kernel)
+    ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
+    char *                    dev_tmp[GGML_CUDA_MAX_DEVICES];    // device scratch for copy-engine path
+    cudaStream_t             streams[GGML_CUDA_MAX_DEVICES];   // non-blocking
+    ggml_cuda_ar_event_slot  ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
+
+    // Copy-engine: per-device "I finished reading my peer's host_large"
+    // event.  Indexed by RECORDER device.  Recorded same-device on streams[i]
+    // after stage 2's last H2D from host_large[peer].  Waited cross-device
+    // by peer's stage-1 stream before the next AR overwrites host_large[peer].
+    cudaEvent_t              host_large_read_done[GGML_CUDA_MAX_DEVICES];
+    bool                     host_large_read_done_valid;
+
+    // Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
+    // Recorded on the compute stream after each add_kernel; the AR stream
+    // waits on it before the next copy_impl's H2D overwrites dev_tmp.  Lets us
+    // single-buffer dev_tmp despite add_kernel running on a separate stream.
+    cudaEvent_t              dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
+    bool                     dev_tmp_kernel_done_valid;
+
+    // Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints.  Mapped pinned
+    // memory; CPU never reads/writes -- only the kernel and cudaMemset.
+    // Use ggml_cuda_ar_arrival_ptr() to index.
+    ggml_cuda_ar_host_mapping arrival;
+};
+
+// Base pointer for the (slot, rank) per-block token block.  The kernel adds
+// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
+static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
+    const size_t offset = ((size_t)slot * p->n_devices + rank) *
+                          GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
+    return reinterpret_cast<int *>(p->arrival.dev + offset);
+}
+
+static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
+    const char * value = getenv(name);
+    if (value == nullptr || value[0] == '\0') {
+        return default_value;
+    }
+
+    char * end = nullptr;
+    const unsigned long long parsed = strtoull(value, &end, 10);
+    return end != value ? (uint64_t) parsed : default_value;
+}
+
+struct ggml_cuda_ar_slot_info {
+    int slot;
+    int token;
+};
+
+static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
+    const int  slot        = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
+    const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
+    p->call_count++;
+
+    if (pool_lapped) {
+        for (int i = 0; i < p->n_devices; ++i) {
+            ggml_cuda_set_device(p->devices[i]);
+            CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
+        }
+    }
+
+    return { slot, (int) p->call_count };
+}
+
+// Per-AR copy-engine chunk size: env-var override if set, else heuristic
+// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
+static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
+    if (p->copy_chunk_bytes > 0) {
+        return p->copy_chunk_bytes;
+    }
+    return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
+                    std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
+}
+
+static void ggml_cuda_ar_wait_for_compute(
+        ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
+    ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
+    CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
+    CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
+}
+
+// ---------------------------------------------------------------------------
+// Init / free
+// ---------------------------------------------------------------------------
+
+ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
+
+    if (n_devices != 2) {
+        GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
+                       "falling back\n", __func__, n_devices);
+        return nullptr;
+    }
+
+    // The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
+    for (size_t i = 0; i < n_devices; ++i) {
+        const int cc = ggml_cuda_info().devices[devices[i]].cc;
+        if (cc < GGML_CUDA_CC_VOLTA) {
+            GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
+                           "(device %d has cc=%d); falling back\n",
+                           __func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
+            return nullptr;
+        }
+    }
+
+    auto * p = new ggml_cuda_ar_pipeline{};
+    p->n_devices        = n_devices;
+    p->copy_bytes       = GGML_CUDA_AR_COPY_MAX_BYTES;
+    p->copy_threshold   = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
+    // 0 = use the per-call heuristic (default).  Non-zero env value forces a
+    // fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
+    p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
+    if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
+        GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
+                      __func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
+        p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
+    }
+    // Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
+    // ne).  Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
+    // byte threshold to opt out for small tensors.
+    p->bf16_threshold   = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
+    for (size_t i = 0; i < n_devices; ++i) {
+        p->devices[i] = devices[i];
+    }
+
+    // Per-device streams and event pools.
+    for (size_t i = 0; i < n_devices; ++i) {
+        ggml_cuda_set_device(p->devices[i]);
+
+        cudaStream_t stream = nullptr;
+        if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
+                           __func__, p->devices[i]);
+            ggml_cuda_ar_pipeline_free(p);
+            return nullptr;
+        }
+        p->streams[i] = stream;
+
+        for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
+            bool ok =
+                cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
+                cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
+                cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
+            for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
+                ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
+            }
+            if (!ok) {
+                GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
+                               __func__, p->devices[i], s);
+                ggml_cuda_ar_pipeline_free(p);
+                return nullptr;
+            }
+        }
+
+        if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
+                           __func__, p->devices[i]);
+            ggml_cuda_ar_pipeline_free(p);
+            return nullptr;
+        }
+        if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
+                           __func__, p->devices[i]);
+            ggml_cuda_ar_pipeline_free(p);
+            return nullptr;
+        }
+    }
+
+    // Arrival ring: cache-line padded so each GPU's int is on its own line.
+    const size_t arrival_bytes =
+        (size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
+        GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
+    if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
+        GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
+                       __func__, arrival_bytes);
+        ggml_cuda_ar_pipeline_free(p);
+        return nullptr;
+    }
+    ggml_cuda_set_device(p->devices[0]);
+    if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
+        GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
+                       __func__, arrival_bytes);
+        ggml_cuda_ar_pipeline_free(p);
+        return nullptr;
+    }
+
+    // Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
+    // kernel can write the next slot's data while the peer is still reading
+    // the previous slot's. Indexed by (slot * buf_bytes) at the call site.
+    p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
+    const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
+    for (size_t i = 0; i < n_devices; ++i) {
+        if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
+                           __func__, host_buf_total);
+            ggml_cuda_ar_pipeline_free(p);
+            return nullptr;
+        }
+    }
+
+    // Copy-engine path: pinned host staging + device scratch, sized for the
+    // largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
+    // dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
+    // cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
+    for (size_t i = 0; i < n_devices; ++i) {
+        ggml_cuda_set_device(p->devices[i]);
+        if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
+                           __func__, p->copy_bytes);
+            ggml_cuda_ar_pipeline_free(p);
+            return nullptr;
+        }
+        if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
+            GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
+                           __func__, p->copy_bytes, p->devices[i]);
+            ggml_cuda_ar_pipeline_free(p);
+            return nullptr;
+        }
+    }
+
+    GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
+                  "%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
+                  __func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
+
+    return p;
+}
+
+void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
+    if (!p) {
+        return;
+    }
+
+    // Drain all in-flight kernels before tearing down resources.
+    for (int i = 0; i < p->n_devices; ++i) {
+        if (p->streams[i]) {
+            ggml_cuda_set_device(p->devices[i]);
+            cudaStreamSynchronize(p->streams[i]);
+        }
+    }
+
+    for (int i = 0; i < p->n_devices; ++i) {
+        p->host_buf[i].free();
+        p->host_large[i].free();
+        if (p->dev_tmp[i]) {
+            ggml_cuda_set_device(p->devices[i]);
+            cudaFree(p->dev_tmp[i]);
+        }
+        ggml_cuda_set_device(p->devices[i]);
+        for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
+            if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); }
+            for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
+                if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); }
+            }
+            if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); }
+            if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); }
+        }
+        if (p->host_large_read_done[i]) {
+            ggml_cuda_set_device(p->devices[i]);
+            cudaEventDestroy(p->host_large_read_done[i]);
+        }
+        if (p->dev_tmp_kernel_done[i]) {
+            ggml_cuda_set_device(p->devices[i]);
+            cudaEventDestroy(p->dev_tmp_kernel_done[i]);
+        }
+        if (p->streams[i]) {
+            ggml_cuda_set_device(p->devices[i]);
+            cudaStreamDestroy(p->streams[i]);
+        }
+    }
+    p->arrival.free();
+    delete p;
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch
+// ---------------------------------------------------------------------------
+
+// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
+// nbytes per ne element); accumulated locally into a T_dst buffer.  When
+// T_src == T_dst this is the original homogeneous reduction.  When they differ
+// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
+// for bit-equivalence between GPUs and we skip the otherwise-needed
+// post-conversion entirely.
+template <typename T_src, typename T_dst>
+static bool ggml_cuda_ar_allreduce_copy_impl(
+        ggml_cuda_ar_pipeline * p,
+        ggml_backend_t        * backends,
+        T_src * const           src_buf[GGML_CUDA_MAX_DEVICES],
+        T_dst * const           dst_buf[GGML_CUDA_MAX_DEVICES],
+        const bool              compute[GGML_CUDA_MAX_DEVICES],
+        int64_t                 ne,
+        size_t                  nbytes) {
+    GGML_ASSERT(p->n_devices == 2);
+    GGML_ASSERT(nbytes <= p->copy_bytes);
+    GGML_ASSERT(ne <= std::numeric_limits<int>::max());
+
+    const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
+    GGML_ASSERT(chunk_bytes > 0);
+
+    const int slot = ggml_cuda_ar_acquire_slot(p).slot;
+    const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
+    GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
+
+    ggml_backend_cuda_context * cuda_ctx[2] = {};
+
+    // Stage 1: both GPUs copy their local contribution to pinned host memory.
+    for (int i = 0; i < 2; ++i) {
+        ggml_cuda_set_device(p->devices[i]);
+        cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+        GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
+
+        ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
+
+        // Wait for peer's H2D from our host_large[i] (recorded in the
+        // previous AR's stage 2) to complete before we overwrite host_large[i].
+        // host_large_read_done[peer] = peer finished reading host_large[i].
+        // No-op on the first AR -- no prior record exists.
+        if (p->host_large_read_done_valid) {
+            const int peer = 1 - i;
+            CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
+        }
+
+        if (!compute[i]) {
+            CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
+        }
+
+        for (size_t c = 0; c < copy_chunks; ++c) {
+            const size_t offset = c * chunk_bytes;
+            const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
+                (nbytes - offset) : chunk_bytes;
+
+            CUDA_CHECK(cudaMemcpyAsync(
+                p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
+                cudaMemcpyDeviceToHost, p->streams[i]));
+            CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
+        }
+    }
+
+    // Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
+    // local device scratch (dev_tmp), then performs one device-local add over
+    // the assembled peer tensor.  The H2Ds run on the AR stream (copy engine)
+    // and the add_kernel runs on the caller's compute stream, so the AR stream
+    // stays pure-copy and avoids an in-stream copy->compute engine switch every
+    // AR.  dev_tmp is single-buffered: the AR stream waits cross-stream on the
+    // prior AR's add_kernel-done event before overwriting it.
+    for (int i = 0; i < 2; ++i) {
+        const int peer = 1 - i;
+        ggml_cuda_set_device(p->devices[i]);
+
+        // Wait for the previous AR's add_kernel (on the compute stream) to
+        // finish reading dev_tmp before our H2D overwrites it.  No-op on the
+        // first copy_impl call.
+        if (p->dev_tmp_kernel_done_valid) {
+            CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
+        }
+
+        for (size_t c = 0; c < copy_chunks; ++c) {
+            const size_t offset = c * chunk_bytes;
+            const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
+                (nbytes - offset) : chunk_bytes;
+
+            CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
+            CUDA_CHECK(cudaMemcpyAsync(
+                p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
+                cudaMemcpyHostToDevice, p->streams[i]));
+        }
+
+        // Mark our reads of host_large[peer] complete so peer's next AR can
+        // safely overwrite it.
+        CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
+
+        // Hand off from AR stream (copy engine) to compute stream: compute
+        // stream waits for all H2Ds to finish, then runs the add_kernel.
+        CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
+        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
+
+        const int block_size = 256;
+        int n_blocks = (int) ((ne + block_size - 1) / block_size);
+        if (n_blocks > 1024) {
+            n_blocks = 1024;
+        }
+        ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
+            dst_buf[i],
+            reinterpret_cast<const T_src *>(p->dev_tmp[i]),
+            (int) ne);
+        CUDA_CHECK(cudaGetLastError());
+
+        // Record dev_tmp-released on the compute stream so the next copy_impl
+        // can wait for the kernel to finish before overwriting dev_tmp.  Also
+        // record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
+        CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
+        CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
+    }
+    p->host_large_read_done_valid = true;
+    p->dev_tmp_kernel_done_valid = true;
+
+    return true;
+}
+
+// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
+// the host_large / dev_tmp allocation size).  When the full AR exceeds that,
+// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
+// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
+// slot, so cross-AR fences and pool wraparound work the same way as for any
+// other sequence of small ARs.
+template <typename T_src, typename T_dst>
+static bool ggml_cuda_ar_allreduce_copy_outer(
+        ggml_cuda_ar_pipeline * p,
+        ggml_backend_t        * backends,
+        T_src * const           src_buf[GGML_CUDA_MAX_DEVICES],
+        T_dst * const           dst_buf[GGML_CUDA_MAX_DEVICES],
+        const bool              compute[GGML_CUDA_MAX_DEVICES],
+        int64_t                 ne) {
+    const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
+    GGML_ASSERT(outer_max_elems > 0);
+
+    bool ok = true;
+    for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
+        const int64_t outer_ne     = std::min(outer_max_elems, ne - outer_start);
+        const size_t  outer_nbytes = (size_t) outer_ne * sizeof(T_src);
+
+        T_src * src[GGML_CUDA_MAX_DEVICES] = {};
+        T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
+        for (int i = 0; i < p->n_devices; ++i) {
+            src[i] = src_buf[i] + outer_start;
+            dst[i] = dst_buf[i] + outer_start;
+        }
+        ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
+            p, backends, src, dst, compute, outer_ne, outer_nbytes);
+    }
+    return ok;
+}
+
+bool ggml_cuda_ar_allreduce(
+        ggml_cuda_ar_pipeline * p,
+        ggml_backend_t        * backends,
+        ggml_tensor           ** tensors) {
+    GGML_ASSERT(p != nullptr);
+
+    const int n = p->n_devices;
+    GGML_ASSERT(n == 2);
+
+    const ggml_type input_type = tensors[0]->type;
+    GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
+
+    const int64_t ne = ggml_nelements(tensors[0]);
+    GGML_ASSERT(ne > 0);
+
+    const size_t   input_nbytes = ggml_nbytes(tensors[0]);
+
+    // BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
+    // the reduction (chunked or copy-engine), halving on-wire bytes. Matches
+    // NCCL's behaviour. The pre-conversion zeroes inactive shards so the
+    // inner paths see them as already-prepared compute tensors.
+    const bool use_bf16 =
+        input_type == GGML_TYPE_F32 &&
+        p->bf16_threshold > 0 &&
+        input_nbytes >= p->bf16_threshold;
+
+    const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
+    const size_t    type_size   = ggml_type_size(kernel_type);
+    GGML_ASSERT(p->buf_bytes >= type_size);
+    const size_t    nbytes      = (size_t) ne * type_size;
+
+    bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
+    for (int i = 0; i < n; ++i) {
+        compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
+    }
+
+    // Decide between copy-engine and chunked kernel paths based on the working
+    // type's actual byte count.  No upper bound: copy_outer slices reductions
+    // larger than copy_bytes into copy_bytes-sized pieces.
+    const bool use_copy_engine =
+        p->copy_threshold > 0 &&
+        nbytes >= p->copy_threshold;
+
+    // BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
+    // (chunked kernel path) and the combined add kernel (copy_engine path)
+    // both accumulate into the F32 tensor data directly, so an inactive
+    // shard's accumulator must start at zero.
+    if (use_bf16) {
+        for (int i = 0; i < n; ++i) {
+            if (!compute_flag[i]) {
+                auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+                GGML_ASSERT(cuda_ctx->device == p->devices[i]);
+                ggml_cuda_set_device(p->devices[i]);
+                CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
+            }
+        }
+    }
+
+    // Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
+    // path; the chunked kernel path's combined kernel does the conversion
+    // inline as it writes to host_buf.
+    ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
+    void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
+
+    if (use_copy_engine && use_bf16) {
+        to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
+        for (int i = 0; i < n; ++i) {
+            auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+            GGML_ASSERT(cuda_ctx->device == p->devices[i]);
+            bf16_tmp[i].pool = &cuda_ctx->pool();
+            bf16_tmp[i].alloc(ne);
+            ggml_cuda_set_device(p->devices[i]);
+            if (compute_flag[i]) {
+                to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
+                CUDA_CHECK(cudaGetLastError());
+            } else {
+                CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
+            }
+            copy_src_ptr[i] = bf16_tmp[i].get();
+        }
+    }
+
+    bool ok = true;
+    if (use_copy_engine) {
+        // After up-front BF16 conversion, the tmp buffers already hold the
+        // (possibly zeroed-for-inactive) data, so the inner path can treat
+        // every shard as compute.
+        bool inner_compute[GGML_CUDA_MAX_DEVICES];
+        for (int i = 0; i < n; ++i) {
+            inner_compute[i] = use_bf16 ? true : compute_flag[i];
+        }
+
+        // Dispatch into copy_impl with explicit src/dst types.  When use_bf16
+        // is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
+        // is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
+        // through BF16 for bit-equivalence and writes F32 directly, so no
+        // post-conversion is needed.  Otherwise src == dst (same native type).
+        if (use_bf16) {
+            GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
+            nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
+            float       * dst[GGML_CUDA_MAX_DEVICES] = {};
+            for (int i = 0; i < n; ++i) {
+                src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
+                dst[i] = static_cast<float *>(tensors[i]->data);
+            }
+            ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
+                p, backends, src, dst, inner_compute, ne);
+        } else {
+            switch (kernel_type) {
+                case GGML_TYPE_F32: {
+                    float * buf[GGML_CUDA_MAX_DEVICES] = {};
+                    for (int i = 0; i < n; ++i) {
+                        buf[i] = static_cast<float *>(tensors[i]->data);
+                    }
+                    ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
+                        p, backends, buf, buf, inner_compute, ne);
+                    break;
+                }
+                case GGML_TYPE_BF16: {
+                    nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
+                    for (int i = 0; i < n; ++i) {
+                        buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
+                    }
+                    ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
+                        p, backends, buf, buf, inner_compute, ne);
+                    break;
+                }
+                case GGML_TYPE_F16: {
+                    half * buf[GGML_CUDA_MAX_DEVICES] = {};
+                    for (int i = 0; i < n; ++i) {
+                        buf[i] = static_cast<half *>(tensors[i]->data);
+                    }
+                    ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
+                        p, backends, buf, buf, inner_compute, ne);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false);
+            }
+        }
+    } else {
+        // host_buf carries T_wire-typed data; max_chunk_elems is the count that
+        // fits in one host_buf at the wire size.
+        const size_t max_chunk_elems = p->buf_bytes / type_size;
+        const size_t input_type_size = ggml_type_size(input_type);
+
+        // Chunked kernel path runs entirely on the caller's compute stream:
+        // since AR is a barrier here, same-stream ordering subsumes any
+        // cross-stream event handshake that the copy-engine path needs, and
+        // skips the cross-stream scheduling overhead that was hurting the
+        // small-tensor (tg) latency on the AR-stream variant.  Only ev.ker is
+        // still recorded at end-of-AR for acquire_slot's pool-wraparound check.
+        for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
+            const size_t remaining_elems = (size_t) (ne - chunk_start);
+            const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
+            const size_t chunk_dst_bytes  = chunk_elems * input_type_size;
+
+            const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
+            const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
+
+            for (int i = 0; i < n; ++i) {
+                const int peer = 1 - i;  // valid for n == 2 only
+                ggml_cuda_set_device(p->devices[i]);
+                auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
+                GGML_ASSERT(cuda_ctx->device == p->devices[i]);
+                cudaStream_t stream = cuda_ctx->stream();
+
+                char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;
+
+                // Match NCCL/meta-backend semantics: inactive shards contribute
+                // zeros.  On the BF16 path the F32 tensor data was already
+                // zeroed up-front (above), so per-chunk zeroing isn't needed.
+                if (!compute_flag[i] && !use_bf16) {
+                    CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
+                }
+
+#define LAUNCH_AR_KERNEL(T_dst, T_wire) \
+                ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
+                    reinterpret_cast<const T_dst *>(data), \
+                    reinterpret_cast<T_dst *>(data), \
+                    reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
+                    reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
+                    static_cast<int>(chunk_elems), \
+                    ggml_cuda_ar_arrival_ptr(p, slot, i), \
+                    ggml_cuda_ar_arrival_ptr(p, slot, peer), \
+                    token)
+
+                if (use_bf16) {
+                    GGML_ASSERT(input_type == GGML_TYPE_F32);
+                    LAUNCH_AR_KERNEL(float, nv_bfloat16);
+                } else {
+                    switch (input_type) {
+                        case GGML_TYPE_F32:  LAUNCH_AR_KERNEL(float,       float);       break;
+                        case GGML_TYPE_F16:  LAUNCH_AR_KERNEL(half,        half);        break;
+                        case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
+                        default: GGML_ASSERT(false);
+                    }
+                }
+
+#undef LAUNCH_AR_KERNEL
+                CUDA_CHECK(cudaGetLastError());
+
+                if (last_chunk) {
+                    CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
+                }
+            }
+        }
+    }
+
+    return ok;
+}
+
+#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
+
+// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
+// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
+// implementation relies on, so the internal AllReduce is a CUDA-only feature.
+// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
+// and silently falls back to the meta backend's generic AllReduce.
+ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
+    return nullptr;
+}
+void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {
+}
+bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
+    return false;
+}
+
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
--- a/ggml/src/ggml-cuda/allreduce.cuh
+++ b/ggml/src/ggml-cuda/allreduce.cuh
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "common.cuh"
+#include "ggml-backend-impl.h"
+
+#include <cstddef>
+
+// Opaque pipeline context -- owns all pinned buffers, streams, and events.
+struct ggml_cuda_ar_pipeline;
+
+// Allocate a pipeline for n_devices GPUs.
+// devices[] holds the CUDA device IDs in rank order.
+// Returns nullptr on allocation failure.
+ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
+    const int * devices, size_t n_devices);
+
+// Release all resources owned by the pipeline.
+void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
+
+// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
+// tensors[i] must live on the device managed by backends[i] and be
+// contiguous F32, F16, or BF16.
+// Preconditions are checked by the CUDA comm dispatcher before calling this.
+// Returns true once the reduction work has been enqueued successfully.
+bool ggml_cuda_ar_allreduce(
+    ggml_cuda_ar_pipeline * pipeline,
+    ggml_backend_t        * backends,
+    ggml_tensor           ** tensors);
+
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -61,6 +61,11 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2,  64,  64,  64,  64, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2,  64,  64,  64,  64, 2, true);

+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128,  8,  64, 4,  64,  96,  64,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16,  64, 4,  32,  96,  64,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2,  32,  96,  64,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2,  32,  96,  64,  64, 2, true);
+
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256,  8,  64, 4,  64, 128, 128, 128, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16,  64, 4,  32, 128, 128, 128, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  32, 128, 128, 128, 2, true);
@@ -1561,6 +1566,10 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
+    if (DKQ == 192 && ncols2 != 8 && ncols2 != 16) {
+        NO_DEVICE_CODE;
+        return;
+    }
 #ifdef VOLTA_MMA_AVAILABLE
    if (ncols1*ncols2 < 32) {
        NO_DEVICE_CODE;
--- a/ggml/src/ggml-cuda/fattn-tile.cu
+++ b/ggml/src/ggml-cuda/fattn-tile.cu
@@ -34,6 +34,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case<128, 128>(ctx, dst);
        } break;
+        case 192: {
+            GGML_ASSERT(V->ne[0] == 128);
+            ggml_cuda_flash_attn_ext_tile_case<192, 128>(ctx, dst);
+        } break;
        case 256: {
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -62,6 +62,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2,  64, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2,  64,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 256, 2,  64,  64)
@@ -124,6 +130,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 128, 3,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2, 128, 3,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 128, 3,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2,  32,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2, 128, 3,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 128, 3,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 256, 2,  32, 256)
@@ -193,6 +205,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2,  64,  32)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2, 256, 2, 128,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2,  32,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2, 256, 2, 128,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 256, 2,  64, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 256, 2,  64, 128)
@@ -264,6 +282,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  2,  64, 8,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  4, 128, 6,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128,  8, 128, 6,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 5,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 3,  64,  64)
+
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  2,  64, 8,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  4, 128, 6,  32, 256)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256,  8, 128, 6,  32, 256)
@@ -1250,7 +1274,20 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
        }
    }

-    if constexpr (DKQ <= 512 && DKQ != 320) {
+    if constexpr (DKQ == 192) {
+        // MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
+        if (use_gqa_opt && gqa_ratio % 16 == 0) {
+            launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
+            return;
+        }
+        if (use_gqa_opt && gqa_ratio % 8 == 0) {
+            launch_fattn_tile_switch_ncols1<DKQ, DV,  8, use_logit_softcap>(ctx, dst);
+            return;
+        }
+        GGML_ABORT("flash-attn tile (192/128): expected GQA ratio multiple of 8");
+    }
+
+    if constexpr (DKQ <= 512 && DKQ != 320 && DKQ != 192) {
        if (use_gqa_opt && gqa_ratio % 8 == 0) {
            launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
            return;
@@ -1303,6 +1340,7 @@ extern DECL_FATTN_TILE_CASE( 80,  80);
 extern DECL_FATTN_TILE_CASE( 96,  96);
 extern DECL_FATTN_TILE_CASE(112, 112);
 extern DECL_FATTN_TILE_CASE(128, 128);
+extern DECL_FATTN_TILE_CASE(192, 128);
 extern DECL_FATTN_TILE_CASE(256, 256);
 extern DECL_FATTN_TILE_CASE(320, 256);
 extern DECL_FATTN_TILE_CASE(512, 512);
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -139,6 +139,22 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
            GGML_ASSERT(V->ne[0] == 128);
            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
            break;
+        case 192: {
+            // MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
+            GGML_ASSERT(V->ne[0] == 128);
+            float max_bias = 0.0f;
+            memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
+            const bool use_gqa_opt = mask && max_bias == 0.0f;
+            GGML_ASSERT(use_gqa_opt);
+            GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
+            const int gqa_ratio = Q->ne[2] / K->ne[2];
+            if (gqa_ratio % 16 == 0) {
+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 16>(ctx, dst);
+            } else {
+                GGML_ASSERT(gqa_ratio % 8 == 0);
+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128,  8>(ctx, dst);
+            }
+        } break;
        case 256:
            GGML_ASSERT(V->ne[0] == 256);
            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
@@ -368,6 +384,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                return BEST_FATTN_KERNEL_NONE;
            }
            break;
+        case 192:
+            if (V->ne[0] != 128 || !gqa_opt_applies) {
+                return BEST_FATTN_KERNEL_NONE;
+            }
+            if (gqa_ratio % 8 != 0) {
+                return BEST_FATTN_KERNEL_NONE;
+            }
+            break;
        case 320:
            if (V->ne[0] != 256 || !gqa_opt_applies) {
                return BEST_FATTN_KERNEL_NONE;
@@ -425,7 +449,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    }

    // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
-    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
+    // 192 satisfies % 64 == 0 but has no vec instance (DKQ != DV); force it onto the MMA path.
+    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && Q->ne[0] != 192 && K->ne[1] % FATTN_KQ_STRIDE == 0;

    // If Turing tensor cores are available, use them:
    if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
@@ -454,7 +479,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const

    if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
        int gqa_ratio_eff = 1;
-        const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
+        const int ncols2_max = (Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8;
        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
            gqa_ratio_eff *= 2;
        }
@@ -468,7 +493,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    }

    // Use the WMMA kernel if possible:
-    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 512 && Q->ne[0] != 576) {
+    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 512 && Q->ne[0] != 576) {
        if (can_use_vector_kernel && Q->ne[1] <= 2) {
            return BEST_FATTN_KERNEL_VEC;
        }
@@ -501,7 +526,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    }

    // Use MFMA flash attention for CDNA (MI100+):
-    if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
+    if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
        const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1);
        // MMA vs tile crossover benchmarked on MI300X @ d32768:
        //   hsk=64  (gqa=4): MMA wins at eff >= 128 (+11%)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2,6 +2,7 @@
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"

+#include "ggml-cuda/allreduce.cuh"
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
 #include "ggml-cuda/add-id.cuh"
@@ -86,6 +87,9 @@

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+#define GGML_LOG_WARN_ONCE(str) \
+    { static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
    int id = -1; // in case cudaGetDevice fails
@@ -1139,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };

-#ifdef GGML_USE_NCCL
+// Communication context for multi-GPU AllReduce during tensor parallelism.
+//
+// Created once per meta backend instance.  Resources for the selected mode
+// (NCCL communicators or the internal AllReduce pipeline) are initialised
+// eagerly during comm_init so any init failure surfaces at startup rather
+// than mid-run.
 struct ggml_backend_cuda_comm_context {
+    using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
+
    std::vector<ggml_backend_t> backends;
-    std::vector<ncclComm_t> comms;
+    std::vector<int>            dev_ids;
+
+    // Set by the init chain (comm_init_{nccl, internal, none}) to one of
+    // try_allreduce_{nccl, internal, butterfly}.  nccl needs `comms`,
+    // internal needs `ar_pipeline`, butterfly needs nothing.  Per-call
+    // failures return false; the meta backend's generic implementation then
+    // handles that call.
+    try_allreduce_fn            try_allreduce = nullptr;
+
+    ggml_cuda_ar_pipeline *     ar_pipeline = nullptr;
+
+#ifdef GGML_USE_NCCL
+    std::vector<ncclComm_t>     comms;
+#endif // GGML_USE_NCCL

    ~ggml_backend_cuda_comm_context() {
+#ifdef GGML_USE_NCCL
        for (ncclComm_t comm : comms) {
            NCCL_CHECK(ncclCommDestroy(comm));
        }
+#endif // GGML_USE_NCCL
+        ggml_cuda_ar_pipeline_free(ar_pipeline);
    }
 };
-#endif // GGML_USE_NCCL

-static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
-#ifdef GGML_USE_NCCL
-    if (comm_ctx_v == nullptr) {
-        return;
-    }
-    ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
-    delete comm_ctx;
-#else
-    GGML_UNUSED(comm_ctx_v);
-#endif // GGML_USE_NCCL
-}
-
-static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
-#ifdef GGML_USE_NCCL
-    for (size_t i = 0; i < n_backends; i++) {
-        if (!ggml_backend_is_cuda(backends[i])) {
-            return nullptr;
-        }
-    }
-    ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
-    std::vector<int> dev_ids;
-    ret->backends.reserve(n_backends);
-    dev_ids.reserve(n_backends);
-    for (size_t i = 0; i < n_backends; i++) {
-        ret->backends.push_back(backends[i]);
-        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
-        dev_ids.push_back(cuda_ctx->device);
-    }
-
-    ret->comms.resize(n_backends);
-    NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
-    return ret;
-#else
-    // If NCCL is installed it is used by default for optimal performance.
-    // However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
-    // RCCL is disabled by default, users are explicitly opting in.
-    // Therefore print no warning for RCCL.
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    static bool warning_printed = false;
-    if (!warning_printed) {
-        GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
-        warning_printed = true;
-    }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    GGML_UNUSED_VARS(backends, n_backends);
-    return nullptr;
-#endif // GGML_USE_NCCL
-}
-
-static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
 #ifdef GGML_USE_NCCL
+// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
+// tensors (bandwidth-bound), then converts back to FP32.
+static bool ggml_backend_cuda_comm_allreduce_nccl(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
    const int64_t ne = ggml_nelements(tensors[0]);
    // FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
    // This then causes a crash in this function
@@ -1210,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
        return true;
    }

-    GGML_ASSERT(comm_ctx_v != nullptr);
-    ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
    const size_t n_backends = comm_ctx->backends.size();

    for (size_t i = 0; i < n_backends; ++i) {
@@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
            NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
        }
        NCCL_CHECK(ncclGroupEnd());
-
        return true;
    }

@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
    }

    return true;
-#else
-    GGML_UNUSED_VARS(comm_ctx_v, tensors);
-    return false;
+}
 #endif // GGML_USE_NCCL
+
+// Run the internal AR pipeline.  Returns false on unsupported / failed input
+// -- the caller decides whether to abort (env-forced) or fall back silently.
+static bool ggml_backend_cuda_comm_allreduce_internal(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
+    GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
+
+    const size_t n_backends = comm_ctx->backends.size();
+    GGML_ASSERT(n_backends == 2);
+    GGML_ASSERT(tensors[0] != nullptr);
+
+    const int64_t   ne   = ggml_nelements(tensors[0]);
+    const ggml_type type = tensors[0]->type;
+
+    if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
+        GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
+        return false;
+    }
+
+    if (ne == 0) {
+        return true;
+    }
+
+    for (size_t i = 0; i < n_backends; ++i) {
+        if (tensors[i] == nullptr) {
+            GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
+            return false;
+        }
+        if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
+            GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
+                           __func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
+            return false;
+        }
+        if (!ggml_is_contiguously_allocated(tensors[i])) {
+            GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
+                           __func__, i, ne, ggml_nbytes(tensors[i]),
+                           (size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
+            return false;
+        }
+        if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
+            GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
+                           __func__, i, tensors[i]->data, (int) type, ne);
+            return false;
+        }
+        GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
+    }
+
+    return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
+}
+
+// ---------------------------------------------------------------------------
+// Per-call dispatch -- three variants, one per backend.  Each is set as
+// comm_ctx->try_allreduce by the matching init step.  Per-call failure
+// returns false; the meta backend's generic implementation handles that call.
+// ---------------------------------------------------------------------------
+
+#ifdef GGML_USE_NCCL
+static bool ggml_backend_cuda_comm_try_allreduce_nccl(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
+    return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
+}
+#endif // GGML_USE_NCCL
+
+static bool ggml_backend_cuda_comm_try_allreduce_internal(
+        ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
+    return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
+}
+
+static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
+        ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
+    return false;
+}
+
+static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
+    if (comm_ctx_v == nullptr) {
+        return;
+    }
+    delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
+}
+
+// ---------------------------------------------------------------------------
+// Init -- chained nccl -> internal -> none.  Each step tries to bring up its
+// resource; on failure it warns and recurses into the next step.
+// ---------------------------------------------------------------------------
+static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
+    ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
+}
+
+static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
+    ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());
+    if (ret->ar_pipeline) {
+        ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
+        return;
+    }
+
+    // Clear sticky CUDA error from the failed init.
+    (void) cudaGetLastError();
+    GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
+                  "falling back to meta-backend butterfly\n");
+    ggml_backend_cuda_comm_init_none(ret);
+}
+
+static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
+#ifdef GGML_USE_NCCL
+    const size_t n = ret->dev_ids.size();
+    ret->comms.resize(n);
+    ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data());
+    if (rc == ncclSuccess) {
+        ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
+        return;
+    }
+
+    ret->comms.clear();
+    GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
+                  ncclGetErrorString(rc));
+#else // GGML_USE_NCCL
+#ifndef GGML_USE_HIP
+    GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce.  "
+                  "Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
+#endif // !GGML_USE_HIP
+#endif // GGML_USE_NCCL
+
+    ggml_backend_cuda_comm_init_internal(ret);
+}
+
+// Top-level init.  Picks one of the three init paths based on
+// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
+// any fallback.  Unrecognised env values warn and fall through to the
+// platform default.
+static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
+    for (size_t i = 0; i < n_backends; i++) {
+        if (!ggml_backend_is_cuda(backends[i])) {
+            return nullptr;
+        }
+    }
+
+    auto * ret = new ggml_backend_cuda_comm_context;
+    ret->backends.assign(backends, backends + n_backends);
+    ret->dev_ids.reserve(n_backends);
+    for (size_t i = 0; i < n_backends; i++) {
+        ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
+    }
+
+    const char * env = getenv("GGML_CUDA_ALLREDUCE");
+    if (!env) {
+        // Platform default: Linux uses NCCL, otherwise (generally Windows) internal
+#if defined(__linux__)
+        ggml_backend_cuda_comm_init_nccl(ret);
+#else
+        ggml_backend_cuda_comm_init_internal(ret);
+#endif // defined(__linux__)
+    } else {
+        std::string env_str(env);
+        if (env_str == "nccl") {
+            ggml_backend_cuda_comm_init_nccl(ret);
+        } else if (env_str == "internal") {
+            ggml_backend_cuda_comm_init_internal(ret);
+        } else if (env_str == "none") {
+            ggml_backend_cuda_comm_init_none(ret);
+        } else {
+            GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
+            ggml_backend_cuda_comm_init_none(ret);
+        }
+    }
+
+    return ret;
+}
+
+// Top-level dispatch -- calls the function pointer chosen by comm_init.
+// Returns false to let the meta-backend's butterfly run.
+static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
+    if (comm_ctx_v == nullptr) {
+        return false;
+    }
+    auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
+    return comm_ctx->try_allreduce(comm_ctx, tensors);
 }

 ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(192, 128, 1, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 1, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 1, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(192, 128, 2, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 2, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 2, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
@@ -2,4 +2,5 @@

 #include "../fattn-mma-f16.cuh"

+DECL_FATTN_MMA_F16_CASE(192, 128, 4, 16);
 DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 4, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 4, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
 DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
 DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
 DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
+DECL_FATTN_MMA_F16_CASE(192, 128, 8, 8);
 DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
 DECL_FATTN_MMA_F16_CASE(512, 512, 8, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-tile.cuh"
+
+DECL_FATTN_TILE_CASE(192, 128);
--- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
+++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -3,7 +3,10 @@
 from glob import glob
 import os

-HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 320, 512, 576]
+HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 192, 256, 320, 512, 576]
+
+# DKQ -> DV override for asymmetric head dims.
+HEAD_SIZES_V_OVERRIDE = {576: 512, 320: 256, 192: 128}

 TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]

@@ -62,7 +65,7 @@ for filename in glob("*.cu"):
    os.remove(filename)

 for head_size_kq in HEAD_SIZES_KQ:
-    head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
+    head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
    with open(f"fattn-tile-instance-dkq{head_size_kq}-dv{head_size_v}.cu", "w") as f:
        f.write(SOURCE_FATTN_TILE.format(head_size_kq=head_size_kq, head_size_v=head_size_v))

@@ -85,15 +88,17 @@ for ncols in [8, 16, 32, 64]:
                if head_size_kq == 72:
                    continue
                # Skip compilation of unused ncols2 values for niche head sizes:
+                if head_size_kq == 192 and ncols2 not in (8, 16): # MiMo-V2.5
+                    continue
                if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
                    continue
                if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
                    continue
                if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
                    continue
-                if head_size_kq not in (320, 576) and ncols2 in (16, 32):
+                if head_size_kq not in (192, 320, 576) and ncols2 in (16, 32):
                    continue
-                head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
+                head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
                f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))

 for type in TYPES_MMQ:
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2261,6 +2261,58 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
    return true;
 }

+static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * q     = op->src[0];
+    const struct ggml_tensor * k     = op->src[1];
+    const struct ggml_tensor * v     = op->src[2];
+    const struct ggml_tensor * g     = op->src[3];
+    const struct ggml_tensor * beta  = op->src[4];
+    const struct ggml_tensor * state = op->src[5];
+    const struct ggml_tensor * dst   = op;
+
+    if (!q || !k || !v || !g || !beta || !state) {
+        return false;
+    }
+
+    if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
+        g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
+        dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
+        !ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
+        !ggml_is_contiguous(dst)) {
+        return false;
+    }
+
+    const int64_t S_v      = v->ne[0];
+    const int64_t H        = v->ne[1];
+    const int64_t n_tokens = v->ne[2];
+    const int64_t n_seqs   = v->ne[3];
+
+    if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
+        return false;
+    }
+    if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
+        q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
+        (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
+        return false;
+    }
+    if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
+        return false;
+    }
+    if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
+        return false;
+    }
+    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
+        return false;
+    }
+
+    GGML_UNUSED(sess);
+    return true;
+}
+
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];
@@ -2420,8 +2472,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
        return false;
    }

-    // TODO: add support for non-contiguous elements within a row
-    if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) {
+    // dst must be contiguous; src0 may be non-contiguous
+    if (!ggml_is_contiguous(dst)) {
        return false;
    }

@@ -2777,32 +2829,34 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {

 static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
    switch (t->op) {
-        case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
-        case GGML_OP_MUL_MAT:        return HTP_OP_MUL_MAT;
-        case GGML_OP_MUL_MAT_ID:     return HTP_OP_MUL_MAT_ID;
-        case GGML_OP_MUL:            return HTP_OP_MUL;
-        case GGML_OP_ADD:            return HTP_OP_ADD;
-        case GGML_OP_ADD_ID:         return HTP_OP_ADD_ID;
-        case GGML_OP_SUB:            return HTP_OP_SUB;
-        case GGML_OP_DIV:            return HTP_OP_DIV;
-        case GGML_OP_CPY:            return HTP_OP_CPY;
-        case GGML_OP_CONT:           return HTP_OP_CPY;
-        case GGML_OP_GET_ROWS:       return HTP_OP_GET_ROWS;
-        case GGML_OP_SET_ROWS:       return HTP_OP_SET_ROWS;
-        case GGML_OP_SUM_ROWS:       return HTP_OP_SUM_ROWS;
-        case GGML_OP_ARGSORT:        return HTP_OP_ARGSORT;
-        case GGML_OP_RMS_NORM:       return HTP_OP_RMS_NORM;
-        case GGML_OP_SCALE:          return HTP_OP_SCALE;
-        case GGML_OP_SQR:            return HTP_OP_SQR;
-        case GGML_OP_SQRT:           return HTP_OP_SQRT;
-        case GGML_OP_SOFT_MAX:       return HTP_OP_SOFTMAX;
-        case GGML_OP_SSM_CONV:       return HTP_OP_SSM_CONV;
-        case GGML_OP_ROPE:           return HTP_OP_ROPE;
-        case GGML_OP_REPEAT:         return HTP_OP_REPEAT;
-        case GGML_OP_CUMSUM:         return HTP_OP_CUMSUM;
-        case GGML_OP_FILL:           return HTP_OP_FILL;
-        case GGML_OP_DIAG:           return HTP_OP_DIAG;
-        case GGML_OP_SOLVE_TRI:      return HTP_OP_SOLVE_TRI;
+        case GGML_OP_FLASH_ATTN_EXT:  return HTP_OP_FLASH_ATTN_EXT;
+        case GGML_OP_MUL_MAT:         return HTP_OP_MUL_MAT;
+        case GGML_OP_MUL_MAT_ID:      return HTP_OP_MUL_MAT_ID;
+        case GGML_OP_MUL:             return HTP_OP_MUL;
+        case GGML_OP_ADD:             return HTP_OP_ADD;
+        case GGML_OP_ADD_ID:          return HTP_OP_ADD_ID;
+        case GGML_OP_SUB:             return HTP_OP_SUB;
+        case GGML_OP_DIV:             return HTP_OP_DIV;
+        case GGML_OP_CPY:             return HTP_OP_CPY;
+        case GGML_OP_CONT:            return HTP_OP_CPY;
+        case GGML_OP_GET_ROWS:        return HTP_OP_GET_ROWS;
+        case GGML_OP_SET_ROWS:        return HTP_OP_SET_ROWS;
+        case GGML_OP_SUM_ROWS:        return HTP_OP_SUM_ROWS;
+        case GGML_OP_ARGSORT:         return HTP_OP_ARGSORT;
+        case GGML_OP_L2_NORM:         return HTP_OP_L2_NORM;
+        case GGML_OP_RMS_NORM:        return HTP_OP_RMS_NORM;
+        case GGML_OP_SCALE:           return HTP_OP_SCALE;
+        case GGML_OP_SQR:             return HTP_OP_SQR;
+        case GGML_OP_SQRT:            return HTP_OP_SQRT;
+        case GGML_OP_SOFT_MAX:        return HTP_OP_SOFTMAX;
+        case GGML_OP_SSM_CONV:        return HTP_OP_SSM_CONV;
+        case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
+        case GGML_OP_ROPE:            return HTP_OP_ROPE;
+        case GGML_OP_REPEAT:          return HTP_OP_REPEAT;
+        case GGML_OP_CUMSUM:          return HTP_OP_CUMSUM;
+        case GGML_OP_FILL:            return HTP_OP_FILL;
+        case GGML_OP_DIAG:            return HTP_OP_DIAG;
+        case GGML_OP_SOLVE_TRI:       return HTP_OP_SOLVE_TRI;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(t)) {
                case GGML_UNARY_OP_SILU:     return HTP_OP_UNARY_SILU;
@@ -3253,6 +3307,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_add_id(sess, op);
            break;

+        case GGML_OP_L2_NORM:
+            supp = ggml_hexagon_supported_unary(sess, op);
+            break;
+
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
            supp = ggml_hexagon_supported_unary(sess, op);
@@ -3336,6 +3394,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_ssm_conv(sess, op);
            break;

+        case GGML_OP_GATED_DELTA_NET:
+            supp = ggml_hexagon_supported_gated_delta_net(sess, op);
+            break;
+
        case GGML_OP_CUMSUM:
            supp = ggml_hexagon_supported_cumsum(sess, op);
            break;
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -37,6 +37,7 @@ add_library(${HTP_LIB} SHARED
    fill-ops.c
    diag-ops.c
    solve-tri-ops.c
+    gated-delta-net-ops.c
 )

 target_compile_definitions(${HTP_LIB} PRIVATE
--- a/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
+++ b/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
@@ -0,0 +1,955 @@
+#include <math.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "hvx-utils.h"
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#define HTP_GDN_MAX_SV 128
+
+struct htp_gdn_context {
+    struct htp_ops_context * octx;
+    uint32_t rows_per_thread;
+    size_t state_bytes;
+    bool use_vtcm;
+    uint8_t * vtcm_state_base;
+    size_t vtcm_state_per_thread;
+};
+
+static inline float gdn_mul_dot_f32(float * restrict dst, const float * restrict mul,
+        const float * restrict dot, uint32_t n) {
+    HVX_Vector acc = Q6_V_vzero();
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vd = hvx_vmemu(dst + i * epv);
+        HVX_Vector vm = hvx_vmem(mul + i * epv);
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+        HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
+        hvx_vmemu(dst + i * epv) = out;
+        acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vd = hvx_vmemu(dst + off);
+        HVX_Vector vm = hvx_vmem(mul + off);
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm);
+        hvx_vec_store_u(dst + off, tail * sizeof(float), out);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
+        acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
+    }
+
+    return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
+}
+
+static inline float gdn_mul_scalar_dot_f32(float * restrict dst, float mul,
+        const float * restrict dot, uint32_t n) {
+    HVX_Vector acc = Q6_V_vzero();
+    const HVX_Vector vmul = hvx_vec_splat_f32(mul);
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vd = hvx_vmemu(dst + i * epv);
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+        HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
+        hvx_vmemu(dst + i * epv) = out;
+        acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vd = hvx_vmemu(dst + off);
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul);
+        hvx_vec_store_u(dst + off, tail * sizeof(float), out);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
+        acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
+    }
+
+    return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
+}
+
+static inline float gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src,
+        float scale, const float * restrict dot, uint32_t n) {
+    HVX_Vector acc = Q6_V_vzero();
+    const HVX_Vector vscale = hvx_vec_splat_f32(scale);
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vd = hvx_vmemu(dst + i * epv);
+        HVX_Vector vs = hvx_vmem(src + i * epv);
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+        HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
+        hvx_vmemu(dst + i * epv) = out;
+        acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vd = hvx_vmemu(dst + off);
+        HVX_Vector vs = hvx_vmem(src + off);
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale));
+        hvx_vec_store_u(dst + off, tail * sizeof(float), out);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot);
+        acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero()));
+    }
+
+    return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc));
+}
+
+static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, const float * restrict mul,
+        const float * restrict dot, uint32_t n, float * restrict sums) {
+    HVX_Vector acc0 = Q6_V_vzero();
+    HVX_Vector acc1 = Q6_V_vzero();
+    HVX_Vector acc2 = Q6_V_vzero();
+    HVX_Vector acc3 = Q6_V_vzero();
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vm = hvx_vmem(mul + i * epv);
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);
+
+        hvx_vmemu(dst0 + i * epv) = out0;
+        hvx_vmemu(dst1 + i * epv) = out1;
+        hvx_vmemu(dst2 + i * epv) = out2;
+        hvx_vmemu(dst3 + i * epv) = out3;
+
+        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
+        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
+        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
+        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vm = hvx_vmem(mul + off);
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector zero = Q6_V_vzero();
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
+
+        hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
+        hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
+        hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
+        hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
+
+        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
+        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
+        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
+        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
+    }
+
+    HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
+    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
+}
+
+static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float mul,
+        const float * restrict dot, uint32_t n, float * restrict sums) {
+    HVX_Vector acc0 = Q6_V_vzero();
+    HVX_Vector acc1 = Q6_V_vzero();
+    HVX_Vector acc2 = Q6_V_vzero();
+    HVX_Vector acc3 = Q6_V_vzero();
+    const HVX_Vector vmul = hvx_vec_splat_f32(mul);
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);
+
+        hvx_vmemu(dst0 + i * epv) = out0;
+        hvx_vmemu(dst1 + i * epv) = out1;
+        hvx_vmemu(dst2 + i * epv) = out2;
+        hvx_vmemu(dst3 + i * epv) = out3;
+
+        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
+        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
+        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
+        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector zero = Q6_V_vzero();
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
+
+        hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
+        hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
+        hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
+        hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
+
+        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
+        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
+        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
+        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
+    }
+
+    HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
+    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
+}
+
+static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, const float * restrict src,
+        const float * restrict scale, const float * restrict dot, uint32_t n,
+        float * restrict sums) {
+    HVX_Vector acc0 = Q6_V_vzero();
+    HVX_Vector acc1 = Q6_V_vzero();
+    HVX_Vector acc2 = Q6_V_vzero();
+    HVX_Vector acc3 = Q6_V_vzero();
+    const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
+    const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
+    const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
+    const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vs = hvx_vmem(src + i * epv);
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+
+        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
+        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
+        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
+        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));
+
+        hvx_vmemu(dst0 + i * epv) = out0;
+        hvx_vmemu(dst1 + i * epv) = out1;
+        hvx_vmemu(dst2 + i * epv) = out2;
+        hvx_vmemu(dst3 + i * epv) = out3;
+
+        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
+        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
+        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
+        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vs = hvx_vmem(src + off);
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector zero = Q6_V_vzero();
+
+        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
+        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
+        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
+        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
+
+        hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
+        hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
+        hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
+        hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
+
+        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
+        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
+        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
+        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
+    }
+
+    HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } };
+    hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc));
+}
+
+static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float * restrict dst4,
+        float * restrict dst5, float * restrict dst6, float * restrict dst7,
+        const float * restrict mul, const float * restrict dot, uint32_t n,
+        float * restrict sums) {
+    HVX_Vector acc0 = Q6_V_vzero();
+    HVX_Vector acc1 = Q6_V_vzero();
+    HVX_Vector acc2 = Q6_V_vzero();
+    HVX_Vector acc3 = Q6_V_vzero();
+    HVX_Vector acc4 = Q6_V_vzero();
+    HVX_Vector acc5 = Q6_V_vzero();
+    HVX_Vector acc6 = Q6_V_vzero();
+    HVX_Vector acc7 = Q6_V_vzero();
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vm = hvx_vmem(mul + i * epv);
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm);
+        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vm);
+        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vm);
+        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vm);
+        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vm);
+
+        hvx_vmemu(dst0 + i * epv) = out0;
+        hvx_vmemu(dst1 + i * epv) = out1;
+        hvx_vmemu(dst2 + i * epv) = out2;
+        hvx_vmemu(dst3 + i * epv) = out3;
+        hvx_vmemu(dst4 + i * epv) = out4;
+        hvx_vmemu(dst5 + i * epv) = out5;
+        hvx_vmemu(dst6 + i * epv) = out6;
+        hvx_vmemu(dst7 + i * epv) = out7;
+
+        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
+        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
+        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
+        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
+        acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
+        acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
+        acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
+        acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vm = hvx_vmem(mul + off);
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector zero = Q6_V_vzero();
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm);
+        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vm);
+        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vm);
+        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vm);
+        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vm);
+
+        hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
+        hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
+        hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
+        hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
+        hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
+        hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
+        hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
+        hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
+
+        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
+        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
+        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
+        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
+        acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
+        acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
+        acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
+        acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
+    }
+
+    HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
+    HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
+    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
+    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
+}
+
+static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float * restrict dst4,
+        float * restrict dst5, float * restrict dst6, float * restrict dst7,
+        float mul, const float * restrict dot, uint32_t n, float * restrict sums) {
+    HVX_Vector acc0 = Q6_V_vzero();
+    HVX_Vector acc1 = Q6_V_vzero();
+    HVX_Vector acc2 = Q6_V_vzero();
+    HVX_Vector acc3 = Q6_V_vzero();
+    HVX_Vector acc4 = Q6_V_vzero();
+    HVX_Vector acc5 = Q6_V_vzero();
+    HVX_Vector acc6 = Q6_V_vzero();
+    HVX_Vector acc7 = Q6_V_vzero();
+    const HVX_Vector vmul = hvx_vec_splat_f32(mul);
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul);
+        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vmul);
+        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vmul);
+        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vmul);
+        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vmul);
+
+        hvx_vmemu(dst0 + i * epv) = out0;
+        hvx_vmemu(dst1 + i * epv) = out1;
+        hvx_vmemu(dst2 + i * epv) = out2;
+        hvx_vmemu(dst3 + i * epv) = out3;
+        hvx_vmemu(dst4 + i * epv) = out4;
+        hvx_vmemu(dst5 + i * epv) = out5;
+        hvx_vmemu(dst6 + i * epv) = out6;
+        hvx_vmemu(dst7 + i * epv) = out7;
+
+        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
+        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
+        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
+        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
+        acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
+        acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
+        acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
+        acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector zero = Q6_V_vzero();
+
+        HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul);
+        HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul);
+        HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul);
+        HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul);
+        HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vmul);
+        HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vmul);
+        HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vmul);
+        HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vmul);
+
+        hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
+        hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
+        hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
+        hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
+        hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
+        hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
+        hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
+        hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
+
+        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
+        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
+        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
+        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
+        acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
+        acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
+        acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
+        acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
+    }
+
+    HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
+    HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
+    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
+    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
+}
+
+static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restrict dst1,
+        float * restrict dst2, float * restrict dst3, float * restrict dst4,
+        float * restrict dst5, float * restrict dst6, float * restrict dst7,
+        const float * restrict src, const float * restrict scale,
+        const float * restrict dot, uint32_t n, float * restrict sums) {
+    HVX_Vector acc0 = Q6_V_vzero();
+    HVX_Vector acc1 = Q6_V_vzero();
+    HVX_Vector acc2 = Q6_V_vzero();
+    HVX_Vector acc3 = Q6_V_vzero();
+    HVX_Vector acc4 = Q6_V_vzero();
+    HVX_Vector acc5 = Q6_V_vzero();
+    HVX_Vector acc6 = Q6_V_vzero();
+    HVX_Vector acc7 = Q6_V_vzero();
+    const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]);
+    const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]);
+    const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]);
+    const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]);
+    const HVX_Vector scale4 = hvx_vec_splat_f32(scale[4]);
+    const HVX_Vector scale5 = hvx_vec_splat_f32(scale[5]);
+    const HVX_Vector scale6 = hvx_vec_splat_f32(scale[6]);
+    const HVX_Vector scale7 = hvx_vec_splat_f32(scale[7]);
+
+    const uint32_t epv = 128 / sizeof(float);
+    const uint32_t nvec = n / epv;
+    const uint32_t tail = n % epv;
+    for (uint32_t i = 0; i < nvec; ++i) {
+        HVX_Vector vs = hvx_vmem(src + i * epv);
+        HVX_Vector vdot = hvx_vmem(dot + i * epv);
+
+        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0));
+        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1));
+        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2));
+        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3));
+        HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + i * epv), hvx_vec_mul_f32_f32(vs, scale4));
+        HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + i * epv), hvx_vec_mul_f32_f32(vs, scale5));
+        HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + i * epv), hvx_vec_mul_f32_f32(vs, scale6));
+        HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + i * epv), hvx_vec_mul_f32_f32(vs, scale7));
+
+        hvx_vmemu(dst0 + i * epv) = out0;
+        hvx_vmemu(dst1 + i * epv) = out1;
+        hvx_vmemu(dst2 + i * epv) = out2;
+        hvx_vmemu(dst3 + i * epv) = out3;
+        hvx_vmemu(dst4 + i * epv) = out4;
+        hvx_vmemu(dst5 + i * epv) = out5;
+        hvx_vmemu(dst6 + i * epv) = out6;
+        hvx_vmemu(dst7 + i * epv) = out7;
+
+        acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot));
+        acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot));
+        acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot));
+        acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot));
+        acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot));
+        acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot));
+        acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot));
+        acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot));
+    }
+
+    if (tail) {
+        const uint32_t off = nvec * epv;
+        HVX_Vector vs = hvx_vmem(src + off);
+        HVX_Vector vdot = hvx_vmem(dot + off);
+        HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float));
+        HVX_Vector zero = Q6_V_vzero();
+
+        HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0));
+        HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1));
+        HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2));
+        HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3));
+        HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + off), hvx_vec_mul_f32_f32(vs, scale4));
+        HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + off), hvx_vec_mul_f32_f32(vs, scale5));
+        HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + off), hvx_vec_mul_f32_f32(vs, scale6));
+        HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + off), hvx_vec_mul_f32_f32(vs, scale7));
+
+        hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0);
+        hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1);
+        hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2);
+        hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3);
+        hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4);
+        hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5);
+        hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6);
+        hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7);
+
+        acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero));
+        acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero));
+        acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero));
+        acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero));
+        acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero));
+        acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero));
+        acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero));
+        acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero));
+    }
+
+    HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } };
+    HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } };
+    hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA));
+    hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB));
+}
+
+static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
+    struct htp_ops_context * octx = gctx->octx;
+
+    const struct htp_tensor * q     = octx->src[0];
+    const struct htp_tensor * k     = octx->src[1];
+    const struct htp_tensor * v     = octx->src[2];
+    const struct htp_tensor * g     = octx->src[3];
+    const struct htp_tensor * beta  = octx->src[4];
+    const struct htp_tensor * state = octx->src[5];
+    const struct htp_tensor * dst   = octx->dst;
+
+    const uint32_t S_v      = v->ne[0];
+    const uint32_t H        = v->ne[1];
+    const uint32_t n_tokens = v->ne[2];
+    const uint32_t n_seqs   = v->ne[3];
+
+    const uint32_t total_rows = H * n_seqs;
+    if (ith >= total_rows) {
+        return;
+    }
+
+    const uint32_t rq3 = n_seqs / q->ne[3];
+    const uint32_t rk3 = n_seqs / k->ne[3];
+    const float scale = 1.0f / sqrtf((float) S_v);
+
+    float * dst_base       = (float *) (uintptr_t) dst->data;
+    float * state_out_base = dst_base + (uint64_t) S_v * H * n_tokens * n_seqs;
+    const float * state_in_base = (const float *) (uintptr_t) state->data;
+
+    const bool kda = (g->ne[0] == S_v);
+    float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_sums[4] __attribute__((aligned(128)));
+
+    for (uint32_t ir = ith; ir < total_rows; ir += nth) {
+        const uint32_t iv1 = ir % H;
+        const uint32_t iv3 = ir / H;
+
+        const uint32_t iq1 = iv1 % q->ne[1];
+        const uint32_t ik1 = iv1 % k->ne[1];
+        const uint32_t iq3 = iv3 / rq3;
+        const uint32_t ik3 = iv3 / rk3;
+
+        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+
+        memcpy(s_out, s_in, gctx->state_bytes);
+        float * s_work = s_out;
+
+        float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v;
+
+        for (uint32_t t = 0; t < n_tokens; ++t) {
+            const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
+                    (uint64_t) iq3 * q->nb[3] + (uint64_t) t * q->nb[2] + (uint64_t) iq1 * q->nb[1]);
+            const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
+                    (uint64_t) ik3 * k->nb[3] + (uint64_t) t * k->nb[2] + (uint64_t) ik1 * k->nb[1]);
+            const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
+                    (uint64_t) iv3 * v->nb[3] + (uint64_t) t * v->nb[2] + (uint64_t) iv1 * v->nb[1]);
+            const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
+                    (uint64_t) iv3 * g->nb[3] + (uint64_t) t * g->nb[2] + (uint64_t) iv1 * g->nb[1]);
+            const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
+                    (uint64_t) iv3 * beta->nb[3] + (uint64_t) t * beta->nb[2] + (uint64_t) iv1 * beta->nb[1]);
+
+            memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
+            memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
+
+            if (kda) {
+                hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
+
+                uint32_t j = 0;
+                for (; j + 4 <= S_v; j += 4) {
+                    float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                    float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                    float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                    float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                    gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
+                    float local_delta_b[4] __attribute__((aligned(128)));
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                    }
+                    gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        attn_data[j + r] = local_sums[r] * scale;
+                    }
+                }
+                for (; j < S_v; ++j) {
+                    float * row = s_work + (uint64_t) j * S_v;
+                    const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
+                    const float dj = (v_t[j] - sum) * beta_val;
+                    attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+                }
+            } else {
+                const float gate = expf(g_t[0]);
+                uint32_t j = 0;
+                for (; j + 4 <= S_v; j += 4) {
+                    float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                    float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                    float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                    float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                    gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
+                    float local_delta_b[4] __attribute__((aligned(128)));
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                    }
+                    gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                    for (uint32_t r = 0; r < 4; ++r) {
+                        attn_data[j + r] = local_sums[r] * scale;
+                    }
+                }
+                for (; j < S_v; ++j) {
+                    float * row = s_work + (uint64_t) j * S_v;
+                    const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
+                    const float dj = (v_t[j] - sum) * beta_val;
+                    attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+                }
+            }
+
+            attn_data += (uint64_t) S_v * H;
+        }
+    }
+}
+
+static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_gdn_context * gctx = (struct htp_gdn_context *) data;
+    struct htp_ops_context * octx = gctx->octx;
+
+    const struct htp_tensor * q     = octx->src[0];
+    const struct htp_tensor * k     = octx->src[1];
+    const struct htp_tensor * v     = octx->src[2];
+    const struct htp_tensor * g     = octx->src[3];
+    const struct htp_tensor * beta  = octx->src[4];
+    const struct htp_tensor * state = octx->src[5];
+    const struct htp_tensor * dst   = octx->dst;
+
+    const uint32_t S_v      = v->ne[0];
+    const uint32_t H        = v->ne[1];
+    const uint32_t n_seqs   = v->ne[3];
+
+    const uint32_t total_rows = H * n_seqs;
+    if (ith >= total_rows) {
+        return;
+    }
+
+    const uint32_t rq3 = n_seqs / q->ne[3];
+    const uint32_t rk3 = n_seqs / k->ne[3];
+    const float scale = 1.0f / sqrtf((float) S_v);
+
+    float * dst_base       = (float *) (uintptr_t) dst->data;
+    float * state_out_base = dst_base + (uint64_t) S_v * H * n_seqs;
+    const float * state_in_base = (const float *) (uintptr_t) state->data;
+
+    const bool kda = (g->ne[0] == S_v);
+    float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
+    float local_sums[8] __attribute__((aligned(128)));
+
+    dma_queue * dma = octx->ctx->dma[ith];
+
+    uint8_t * spad = NULL;
+    if (gctx->use_vtcm) {
+        spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith;
+    }
+
+    for (uint32_t ir = ith; ir < total_rows; ir += nth) {
+        const uint32_t iv1 = ir % H;
+        const uint32_t iv3 = ir / H;
+
+        const uint32_t iq1 = iv1 % q->ne[1];
+        const uint32_t ik1 = iv1 % k->ne[1];
+        const uint32_t iq3 = iv3 / rq3;
+        const uint32_t ik3 = iv3 / rk3;
+
+        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        float * s_work;
+
+        if (spad) {
+            dma_queue_push(dma, dma_make_ptr(spad, s_in),
+                           S_v * sizeof(float), S_v * sizeof(float),
+                           S_v * sizeof(float), S_v);
+            dma_queue_pop(dma);
+            s_work = (float *) spad;
+        } else {
+            s_work = s_out;
+            memcpy(s_work, s_in, gctx->state_bytes);
+        }
+
+        float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v;
+
+        const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data +
+                (uint64_t) iq3 * q->nb[3] + (uint64_t) iq1 * q->nb[1]);
+        const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data +
+                (uint64_t) ik3 * k->nb[3] + (uint64_t) ik1 * k->nb[1]);
+        const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data +
+                (uint64_t) iv3 * v->nb[3] + (uint64_t) iv1 * v->nb[1]);
+        const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data +
+                (uint64_t) iv3 * g->nb[3] + (uint64_t) iv1 * g->nb[1]);
+        const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data +
+                (uint64_t) iv3 * beta->nb[3] + (uint64_t) iv1 * beta->nb[1]);
+
+        memcpy(local_q, q_t, (size_t) S_v * sizeof(float));
+        memcpy(local_k, k_t, (size_t) S_v * sizeof(float));
+
+        if (kda) {
+            hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false);
+
+            uint32_t j = 0;
+            for (; j + 8 <= S_v; j += 8) {
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                float * row4 = s_work + (uint64_t) (j + 4) * S_v;
+                float * row5 = s_work + (uint64_t) (j + 5) * S_v;
+                float * row6 = s_work + (uint64_t) (j + 6) * S_v;
+                float * row7 = s_work + (uint64_t) (j + 7) * S_v;
+                gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
+                                 local_gate, local_k, S_v, local_sums);
+                float local_delta_b[8] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 8; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                }
+                gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
+                                        local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 8; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;
+                }
+            }
+            for (; j + 4 <= S_v; j += 4) {
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums);
+                float local_delta_b[4] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 4; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                }
+                gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 4; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;
+                }
+            }
+            for (; j < S_v; ++j) {
+                float * row = s_work + (uint64_t) j * S_v;
+                const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v);
+                const float dj = (v_t[j] - sum) * beta_val;
+                attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+            }
+        } else {
+            const float gate = expf(g_t[0]);
+            uint32_t j = 0;
+            for (; j + 8 <= S_v; j += 8) {
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                float * row4 = s_work + (uint64_t) (j + 4) * S_v;
+                float * row5 = s_work + (uint64_t) (j + 5) * S_v;
+                float * row6 = s_work + (uint64_t) (j + 6) * S_v;
+                float * row7 = s_work + (uint64_t) (j + 7) * S_v;
+                gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
+                                        gate, local_k, S_v, local_sums);
+                float local_delta_b[8] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 8; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                }
+                gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7,
+                                        local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 8; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;
+                }
+            }
+            for (; j + 4 <= S_v; j += 4) {
+                float * row0 = s_work + (uint64_t) (j + 0) * S_v;
+                float * row1 = s_work + (uint64_t) (j + 1) * S_v;
+                float * row2 = s_work + (uint64_t) (j + 2) * S_v;
+                float * row3 = s_work + (uint64_t) (j + 3) * S_v;
+                gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums);
+                float local_delta_b[4] __attribute__((aligned(128)));
+                for (uint32_t r = 0; r < 4; ++r) {
+                    local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val;
+                }
+                gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums);
+                for (uint32_t r = 0; r < 4; ++r) {
+                    attn_data[j + r] = local_sums[r] * scale;
+                }
+            }
+            for (; j < S_v; ++j) {
+                float * row = s_work + (uint64_t) j * S_v;
+                const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v);
+                const float dj = (v_t[j] - sum) * beta_val;
+                attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale;
+            }
+        }
+
+        if (spad) {
+            dma_queue_push(dma, dma_make_ptr(s_out, spad),
+                           S_v * sizeof(float), S_v * sizeof(float),
+                           S_v * sizeof(float), S_v);
+            dma_queue_pop(dma);
+        }
+    }
+}
+
+int op_gated_delta_net(struct htp_ops_context * octx) {
+    const struct htp_tensor * q     = octx->src[0];
+    const struct htp_tensor * k     = octx->src[1];
+    const struct htp_tensor * v     = octx->src[2];
+    const struct htp_tensor * g     = octx->src[3];
+    const struct htp_tensor * beta  = octx->src[4];
+    const struct htp_tensor * state = octx->src[5];
+    const struct htp_tensor * dst   = octx->dst;
+
+    if (!q || !k || !v || !g || !beta || !state || !dst) {
+        return HTP_STATUS_INVAL_PARAMS;
+    }
+
+    if (q->type != HTP_TYPE_F32 || k->type != HTP_TYPE_F32 || v->type != HTP_TYPE_F32 ||
+        g->type != HTP_TYPE_F32 || beta->type != HTP_TYPE_F32 || state->type != HTP_TYPE_F32 ||
+        dst->type != HTP_TYPE_F32) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    const uint32_t S_v      = v->ne[0];
+    const uint32_t H        = v->ne[1];
+    const uint32_t n_tokens = v->ne[2];
+    const uint32_t n_seqs   = v->ne[3];
+
+    if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] == 0 || k->ne[1] == 0 ||
+        q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] == 0 || k->ne[3] == 0 ||
+        (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+
+    struct htp_gdn_context gctx;
+    gctx.octx = octx;
+    gctx.rows_per_thread = (H * n_seqs + octx->n_threads - 1) / octx->n_threads;
+    gctx.state_bytes = (size_t) S_v * S_v * sizeof(float);
+
+    size_t state_aligned = (size_t) S_v * S_v * sizeof(float);
+    state_aligned = (state_aligned + 127) & ~(size_t)127;
+
+    gctx.use_vtcm = false;
+    gctx.vtcm_state_base = NULL;
+    gctx.vtcm_state_per_thread = 0;
+
+    if (n_tokens == 1 && octx->ctx->vtcm_base) {
+        size_t vtcm_total = state_aligned * octx->n_threads;
+        if (octx->ctx->vtcm_size >= vtcm_total) {
+            gctx.use_vtcm = true;
+            gctx.vtcm_state_base = octx->ctx->vtcm_base;
+            gctx.vtcm_state_per_thread = state_aligned;
+        }
+    }
+
+    if (n_tokens == 1) {
+        worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_tg_thread, &gctx, octx->n_threads);
+    } else {
+        worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_pp_thread, &gctx, octx->n_threads);
+    }
+
+    return HTP_STATUS_OK;
+}
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -106,5 +106,6 @@ int op_cumsum(struct htp_ops_context * octx);
 int op_fill(struct htp_ops_context * octx);
 int op_diag(struct htp_ops_context * octx);
 int op_solve_tri(struct htp_ops_context * octx);
+int op_gated_delta_net(struct htp_ops_context * octx);

 #endif /* HTP_CTX_H */
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -83,6 +83,9 @@ enum htp_op_code {
    HTP_OP_FILL,
    HTP_OP_DIAG,
    HTP_OP_SOLVE_TRI,
+    HTP_OP_L2_NORM,
+    HTP_OP_GATED_DELTA_NET,
+
    HTP_OP_INVALID
 };

--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -542,6 +542,7 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_UNARY_SIGMOID:
        case HTP_OP_UNARY_NEG:
        case HTP_OP_UNARY_EXP:
+        case HTP_OP_L2_NORM:
            return op_unary(octx);

        case HTP_OP_UNARY_SILU:
@@ -593,6 +594,9 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_SOLVE_TRI:
            return op_solve_tri(octx);

+        case HTP_OP_GATED_DELTA_NET:
+            return op_gated_delta_net(octx);
+
        case HTP_OP_INVALID:
            break;

--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -298,6 +298,81 @@ static void softplus_f32(const float * restrict src,
    }
 }

+// --- L2_NORM HVX kernel ---
+// Computes y[i] = x[i] / fmax(sqrt(sum(x[j]^2)), epsilon) for each row.
+// scale = 1/fmax(sqrt(sum), epsilon) is computed entirely in HVX registers
+// using rsqrt + inverse to avoid scalar extraction.
+static void hvx_fast_l2_norm_f32(const uint8_t * restrict src,
+                                 uint8_t * restrict dst,
+                                 uint8_t * restrict pad,
+                                 const int num_elems,
+                                 float     epsilon) {
+    (void)pad;
+
+    const HVX_Vector * restrict v_src = (HVX_Vector *) src;
+    HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
+
+    HVX_Vector sum_v = hvx_vec_splat_f32(0.0f);
+
+    const int nvec = num_elems / VLEN_FP32;
+    const int nloe = num_elems % VLEN_FP32;
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
+        sum_v         = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
+    }
+
+    // Include tail elements in the sum-of-squares using a predicate mask
+    if (nloe > 0) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
+        HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
+        sum_v         = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, sq);
+    }
+
+    // Compute scale = 1/fmax(sqrt(sum), epsilon) entirely in HVX registers.
+    // hvx_vec_rsqrt_f32 + hvx_vec_inverse_f32 avoids scalar extraction.
+    HVX_Vector sum_sf    = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
+    HVX_Vector rsqrt_v   = hvx_vec_rsqrt_f32(sum_sf);              // 1/sqrt(sum)
+    HVX_Vector sqrt_v    = hvx_vec_inverse_f32(rsqrt_v);            // sqrt(sum)
+    HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
+    HVX_Vector denom_v   = Q6_Vsf_vmax_VsfVsf(sqrt_v, epsilon_v);  // fmax(sqrt(sum), epsilon)
+    HVX_Vector scale_v   = hvx_vec_inverse_f32(denom_v);            // 1/fmax(sqrt(sum), epsilon)
+
+    #pragma unroll(4)
+    for (int i = 0; i < nvec; i++) {
+        HVX_Vector v1 = v_src[i];
+        v_dst[i]      = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));
+    }
+
+    if (nloe > 0) {
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
+        HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
+        HVX_Vector result = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(v1, scale_v));
+        hvx_vec_store_a(&v_dst[nvec], nloe * 4, result);
+    }
+}
+
+static void l2_norm_f32(const float * restrict src,
+                        float * restrict dst,
+                        uint8_t * restrict spad,
+                        const uint32_t num_rows,
+                        const uint32_t row_elems,
+                        const size_t   row_size,
+                        int32_t *      op_params) {
+    float epsilon = 0.f;
+    memcpy(&epsilon, op_params, sizeof(float));
+
+    for (uint32_t ir = 0; ir < num_rows; ir++) {
+        const float * restrict src_f = (const float *)((const uint8_t *)src + (ir * row_size));
+        float * restrict dst_f       = (float *)((uint8_t *)dst + (ir * row_size));
+
+        hvx_fast_l2_norm_f32((const uint8_t *)src_f, (uint8_t *)dst_f, spad, row_elems, epsilon);
+    }
+}
+
 static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
    const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
    struct htp_ops_context * octx = uctx->octx;
@@ -402,6 +477,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
            case HTP_OP_UNARY_SOFTPLUS:
                softplus_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                break;
+            case HTP_OP_L2_NORM:
+                l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
+                break;
            default:
                break;
        }
@@ -469,6 +547,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
        case HTP_OP_UNARY_SOFTPLUS:
            op_type = "softplus-f32";
            break;
+        case HTP_OP_L2_NORM:
+            op_type = "l2norm-f32";
+            break;

        default:
            FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -135,7 +135,11 @@ endif()

 if (GGML_SYCL_TARGET STREQUAL "INTEL")
    add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
-    target_link_options(ggml-sycl PRIVATE  -Xs   -ze-intel-greater-than-4GB-buffer-required)
+    if (NOT GGML_SYCL_DEVICE_ARCH)
+        target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
+    else()
+        message(STATUS "Skipping -ze-intel-greater-than-4GB-buffer-required for spir64_gen AOT")
+    endif()

    # Link against Intel oneMKL
    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -160,7 +164,15 @@ if (GGML_SYCL_HOST_MEM_FALLBACK)
 endif()

 if (GGML_SYCL_DEVICE_ARCH)
-    target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
-    target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
+    message(STATUS "GGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} (AOT via spir64_gen)")
+    target_compile_options(
+        ggml-sycl PRIVATE
+        -fsycl-targets=spir64_gen
+        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
+    )
+    target_link_options(
+        ggml-sycl PRIVATE
+        -fsycl-targets=spir64_gen
+        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
+    )
 endif()
-
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -25,6 +25,7 @@
 #include "presets.hpp"
 #include "type.hpp"
 #include "sycl_hw.hpp"
+#include "fattn-buffers.hpp"

 namespace syclexp = sycl::ext::oneapi::experimental;

@@ -404,12 +405,16 @@ struct ggml_backend_sycl_context {
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;

+    std::unique_ptr<ggml_sycl_fattn_kv_buffers> fattn_bufs[GGML_SYCL_MAX_DEVICES];
+
    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

+    static std::unique_ptr<ggml_sycl_fattn_kv_buffers> new_fattn_kv_buffers(queue_ptr qptr, int device);
+
    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device,0), device);
@@ -421,6 +426,17 @@ struct ggml_backend_sycl_context {
        return pool(device);
    }

+    ggml_sycl_fattn_kv_buffers & fattn_buffers(int device) {
+        if (fattn_bufs[device] == nullptr) {
+            fattn_bufs[device] = new_fattn_kv_buffers(stream(device, 0), device);
+        }
+        return *fattn_bufs[device];
+    }
+
+    ggml_sycl_fattn_kv_buffers & fattn_buffers() {
+        return fattn_buffers(device);
+    }
+
 #ifdef GGML_SYCL_GRAPH
    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
 #endif
--- a/ggml/src/ggml-sycl/convert.cpp
+++ b/ggml/src/ggml-sycl/convert.cpp
@@ -252,6 +252,23 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
 #endif
 }

+template <typename dst_t>
+static void dequantize_row_q5_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+
+    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(K_SCALE_SIZE), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
+            [=](sycl::nd_item<3> item_ct1) {
+                dequantize_block_q5_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
+            });
+    });
+}
+
 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
                                     dpct::queue_ptr stream) {
@@ -643,7 +660,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
                return dequantize_row_q4_K_sycl;
            }
        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_sycl;
+            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q5_K_sycl_reorder;
+            } else {
+                return dequantize_row_q5_K_sycl;
+            }
        case GGML_TYPE_Q6_K:
            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                return dequantize_row_q6_K_sycl_reorder;
@@ -718,7 +739,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
                return dequantize_row_q4_K_sycl;
            }
        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_sycl;
+            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q5_K_sycl_reorder;
+            } else {
+                return dequantize_row_q5_K_sycl;
+            }
        case GGML_TYPE_Q6_K:
            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                return dequantize_row_q6_K_sycl_reorder;
--- a/ggml/src/ggml-sycl/dequantize.hpp
+++ b/ggml/src/ggml-sycl/dequantize.hpp
@@ -537,6 +537,63 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
 #endif
 }

+template <typename dst_t>
+static void dequantize_block_q5_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                          uint8_t * scales_local, const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
+    const int64_t ib = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 64 threads
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il  = tid / 16;   // 0...3
+    const int64_t ir  = tid % 16;   // 0...15
+    const int64_t is  = 2 * il;
+
+    dst_t * y = yy + ib * QK_K + 64 * il + 2 * ir;
+
+    const uint8_t * base = static_cast<const uint8_t *>(vx);
+
+    // Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales (K_SCALE_SIZE per block)] [dm (half2 per block)]
+    const size_t qs_offset     = ib * (QK_K / 2);
+    const size_t qh_offset     = n_blocks * (QK_K / 2) + ib * (QK_K / 8);
+    const size_t scales_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + ib * K_SCALE_SIZE;
+    const size_t dm_offset     = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + n_blocks * K_SCALE_SIZE + ib * sizeof(ggml_half2);
+
+    const uint8_t *  qs_ptr     = base + qs_offset;
+    const uint8_t *  qh_ptr     = base + qh_offset;
+    const uint8_t *  scales_ptr = base + scales_offset;
+    const ggml_half2 dm_values  = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
+
+    const float dall = dm_values.x();
+    const float dmin = dm_values.y();
+
+    const uint8_t * ql = qs_ptr + 32 * il + 2 * ir;
+    const uint8_t * qh = qh_ptr + 2 * ir;
+
+    if (tid < K_SCALE_SIZE) {
+        scales_local[tid] = scales_ptr[tid];
+    }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, scales_local, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, scales_local, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t hm  = 1 << (2 * il);
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    GGML_UNUSED(ib); GGML_UNUSED(tid); GGML_UNUSED(yy); GGML_UNUSED(scales_local); GGML_UNUSED(n_blocks);
+    GGML_ABORT("Q5_K reorder dequantize not supported for QK_K != 256");
+#endif
+}
+
 template<typename dst_t>
 static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  const sycl::nd_item<3> &item_ct1) {
--- a/ggml/src/ggml-sycl/fattn-buffers.cpp
+++ b/ggml/src/ggml-sycl/fattn-buffers.cpp
@@ -0,0 +1,56 @@
+//
+// MIT license
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "common.hpp"
+
+sycl::half * ggml_sycl_fattn_kv_buffers::kv_buffer::ensure_half(size_t n_elems) {
+    const size_t need_bytes = n_elems * sizeof(sycl::half);
+
+    if (capacity >= need_bytes) {
+        return ptr;
+    }
+
+    if (ptr) {
+        SYCL_CHECK(CHECK_TRY_ERROR(qptr->wait()));
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+        ptr = nullptr;
+        capacity = 0;
+    }
+
+    size_t cap = 0;
+    while (cap < need_bytes) {
+        cap += CHUNK_SIZE;
+    }
+
+    void * dev_ptr;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(dev_ptr = sycl::malloc_device(
+                        cap, *qptr)));
+
+    if (!dev_ptr) {
+        GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, cap);
+        GGML_ABORT("fattn buffer alloc failed");
+    }
+
+    ptr = static_cast<sycl::half *>(dev_ptr);
+    capacity = cap;
+    return ptr;
+}
+
+ggml_sycl_fattn_kv_buffers::kv_buffer::~kv_buffer() {
+#ifdef DEBUG_SYCL_POOL
+    GGML_LOG_INFO("ggml_sycl_fattn_kv_buffer[%d]: %.2f MiB\n", device, capacity / 1024.0 / 1024.0);
+#endif
+    if (ptr) {
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+    }
+}
--- a/ggml/src/ggml-sycl/fattn-buffers.hpp
+++ b/ggml/src/ggml-sycl/fattn-buffers.hpp
@@ -0,0 +1,63 @@
+//
+// MIT license
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_FATTN_BUFFERS_HPP
+#define GGML_SYCL_FATTN_BUFFERS_HPP
+
+#include <sycl/sycl.hpp>
+
+typedef sycl::queue *queue_ptr;
+
+struct ggml_sycl_fattn_kv_buffers {
+    // buffers grow in chunks of this size
+    static constexpr size_t CHUNK_SIZE = 16ull << 20; // 16 MiB
+
+    struct kv_buffer {
+        kv_buffer(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
+        ~kv_buffer();
+
+        kv_buffer(const kv_buffer &) = delete;
+        kv_buffer & operator=(const kv_buffer &) = delete;
+
+        sycl::half * ensure_half(size_t n_elems);
+
+    private:
+        sycl::half * ptr      = nullptr;
+        size_t       capacity = 0;
+        queue_ptr    qptr     = nullptr;
+        [[maybe_unused]] int device = 0;
+    };
+
+    kv_buffer K;
+    kv_buffer V;
+
+    ggml_sycl_fattn_kv_buffers(queue_ptr qptr, int device) : K(qptr, device), V(qptr, device) {}
+
+    ggml_sycl_fattn_kv_buffers(const ggml_sycl_fattn_kv_buffers &) = delete;
+    ggml_sycl_fattn_kv_buffers & operator=(const ggml_sycl_fattn_kv_buffers &) = delete;
+};
+
+/**
+ * Imitates `ggml_sycl_pool_alloc` to keep the code calling alloc unchanged.
+ */
+struct ggml_sycl_fattn_alloc {
+    ggml_sycl_fattn_kv_buffers::kv_buffer & buf;
+    sycl::half *                         ptr = nullptr;
+
+    explicit ggml_sycl_fattn_alloc(ggml_sycl_fattn_kv_buffers::kv_buffer & buf_) : buf(buf_) {}
+
+    sycl::half * alloc(size_t n_elems) {
+        ptr = buf.ensure_half(n_elems);
+        return ptr;
+    }
+};
+#endif
--- a/ggml/src/ggml-sycl/fattn-common.hpp
+++ b/ggml/src/ggml-sycl/fattn-common.hpp
@@ -5,6 +5,7 @@
 #include "common.hpp"
 #include "convert.hpp"
 #include "vecdotq.hpp"
+#include "fattn-buffers.hpp"

 #include "ggml.h"

@@ -918,12 +919,13 @@ void launch_fattn(
    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);

    ggml_sycl_pool & pool = ctx.pool();
+    ggml_sycl_fattn_kv_buffers & fbuf = ctx.fattn_buffers();
    dpct::queue_ptr  main_stream = ctx.stream();
    const int id  = ggml_sycl_get_device();
    const int nsm = ggml_sycl_info().devices[id].nsm;

-    ggml_sycl_pool_alloc<sycl::half>   K_f16(pool);
-    ggml_sycl_pool_alloc<sycl::half>   V_f16(pool);
+    ggml_sycl_fattn_alloc        K_f16(fbuf.K);
+    ggml_sycl_fattn_alloc        V_f16(fbuf.V);
    ggml_sycl_pool_alloc<int>    KV_max(pool);
    ggml_sycl_pool_alloc<float>  dst_tmp(pool);
    ggml_sycl_pool_alloc<sycl::float2> dst_tmp_meta(pool);
--- a/ggml/src/ggml-sycl/getrows.cpp
+++ b/ggml/src/ggml-sycl/getrows.cpp
@@ -183,6 +183,10 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data,
                                src1_i32, (float *)dst->data, ctx.stream());
            break;
+        case GGML_TYPE_BF16:
+            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::ext::oneapi::bfloat16 *)dst->src[0]->data,
+                                src1_i32, (float *)dst->data, ctx.stream());
+            break;
        case GGML_TYPE_F32:
            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
            src1_i32, (float *)dst->data, ctx.stream());
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1286,6 +1286,23 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}

    ~ggml_sycl_pool_leg() {
+#ifdef DEBUG_SYCL_POOL
+        int    n_cached    = 0;
+        size_t bytes_cached = 0;
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            if (buffer_pool[i].ptr != nullptr) {
+                ++n_cached;
+                bytes_cached += buffer_pool[i].size;
+            }
+        }
+        GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
+                      n_cached, bytes_cached / 1024.0 / 1024.0);
+        const auto slots = format_slots_in_alloc_order();
+        if (!slots.empty()) {
+            GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
+        }
+#endif
+
        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
            ggml_sycl_buffer & b = buffer_pool[i];
            if (b.ptr != nullptr) {
@@ -1296,6 +1313,26 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
        GGML_ASSERT(pool_size == 0);
    }

+#ifdef DEBUG_SYCL_POOL
+    std::string format_slots_in_alloc_order() const {
+        std::string line;
+        char buf[32];
+        bool first = true;
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            if (buffer_pool[i].ptr == nullptr) {
+                continue;
+            }
+            if (!first) {
+                line += '/';
+            }
+            first = false;
+            snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
+            line += buf;
+        }
+        return line;
+    }
+#endif
+
    void * alloc(size_t size, size_t * actual_size) override {
 #ifdef DEBUG_sycl_MALLOC
        int nnz = 0;
@@ -1459,6 +1496,10 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
   return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
 }

+std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
+    return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
+}
+
 // TBD pool with virtual memory management
 // struct ggml_sycl_pool_vmm : public ggml_sycl_pool

@@ -3303,6 +3344,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
        case GGML_TYPE_Q8_0:
            return true;
        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return !g_ggml_sycl_prioritize_dmmv;
        default:
@@ -3325,6 +3367,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            return true;
        default:
@@ -3541,6 +3584,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
    return true;
 }

+static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
+    GGML_ASSERT(size % sizeof(block_q5_K) == 0);
+    GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
+
+    const int nblocks = size / sizeof(block_q5_K);
+
+    sycl_reorder_temp_buffer tmp(stream, size);
+    if (!tmp) {
+        GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
+        return false;
+    }
+    uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
+
+    auto * qs_ptr     = data_device;
+    auto * qh_ptr     = qs_ptr + (QK_K / 2) * nblocks;
+    auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
+    auto * dm_ptr     = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
+
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
+        const block_q5_K * x  = (const block_q5_K *) tmp_buf;
+        const int          ib = i;
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+
+        for (int j = 0; j < QK_K / 8; ++j) {
+            qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
+        }
+
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+
+        dm_ptr[ib] = x[ib].dm;
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
+    return true;
+}
+
 static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
    GGML_ASSERT(size % sizeof(block_q6_K) == 0);
    GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
@@ -3607,6 +3698,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
            return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
        case GGML_TYPE_Q4_K:
            return reorder_qw_q4_k(data_device, size, 0, stream);
+        case GGML_TYPE_Q5_K:
+            return reorder_qw_q5_k(data_device, size, 0, stream);
        case GGML_TYPE_Q6_K:
            return reorder_qw_q6_k(data_device, size, 0, stream);
        default:
@@ -4922,6 +5015,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F16:
+                    case GGML_TYPE_BF16:
                    case GGML_TYPE_F32:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
@@ -5104,11 +5198,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_ACC:
            return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
        case GGML_OP_PAD:
-            // TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985
            if (ggml_get_op_params_i32(op, 8) != 0) {
                return false;
            }
-            return ggml_is_contiguous(op->src[0]);
+            return true;
        case GGML_OP_LEAKY_RELU:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_RWKV_WKV6:
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -839,6 +839,26 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
    }
 }

+static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
+                                               const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+
+    const int        block_num_y   = ceil_div(nrows, GGML_SYCL_MMV_Y);
+    constexpr size_t num_subgroups = 16;
+    GGML_ASSERT(block_num_y % num_subgroups == 0);
+
+    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
+    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                            [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(vx, vy, dst, ncols,
+                                                                                            nrows, nd_item);
+                            });
+    });
+}
+
 static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                               const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
@@ -1125,6 +1145,7 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n");
                    reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                } else {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl\n");
                    mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                }
                break;
@@ -1145,7 +1166,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                }
                break;
            case GGML_TYPE_Q5_K:
-                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n");
+                    reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                } else {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl\n");
+                    mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                }
                break;
            case GGML_TYPE_Q6_K:
                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
--- a/ggml/src/ggml-sycl/pad.cpp
+++ b/ggml/src/ggml-sycl/pad.cpp
@@ -13,7 +13,8 @@
 //#include "common.hpp"
 #include "pad.hpp"

-static void pad_f32(const float * src, float * dst,
+static void pad_f32(const float * src, size_t s00, size_t s01, size_t s02, size_t s03,
+                    float * dst,
                    const int lp0, const int rp0, const int lp1, const int rp1,
                    const int lp2, const int rp2, const int lp3, const int rp3,
                    const int ne0, const int ne1, const int ne2, const int ne3,
@@ -27,7 +28,6 @@ static void pad_f32(const float * src, float * dst,
        return;
    }

-    // operation
    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
    if ((i0 >= lp0 && i0 < ne0 - rp0) &&
        (i1 >= lp1 && i1 < ne1 - rp1) &&
@@ -37,12 +37,8 @@ static void pad_f32(const float * src, float * dst,
        const int64_t i01 = i1 - lp1;
        const int64_t i02 = i2 - lp2;
        const int64_t i03 = i3 - lp3;
-        const int64_t ne02 = ne2 - lp2 - rp2;
-        const int64_t ne01 = ne1 - lp1 - rp1;
-        const int64_t ne00 = ne0 - lp0 - rp0;

-        const int64_t src_idx = i03 * (ne00 * ne01 * ne02) +
-                                i02 * (ne00 * ne01) + i01 * ne00 + i00;
+        const int64_t src_idx = i03 * s03 + i02 * s02 + i01 * s01 + i00 * s00;

        dst[dst_idx] = src[src_idx];
    } else {
@@ -50,20 +46,19 @@ static void pad_f32(const float * src, float * dst,
    }
 }

-static void pad_f32_sycl(const float *src, float *dst, const int lp0,
-                         const int rp0, const int lp1, const int rp1,
-                         const int lp2, const int rp2, const int lp3,
-                         const int rp3, const int ne0, const int ne1,
-                         const int ne2, const int ne3,
+static void pad_f32_sycl(const float * src, size_t s00, size_t s01, size_t s02, size_t s03,
+                         float * dst, const int lp0, const int rp0, const int lp1, const int rp1,
+                         const int lp2, const int rp2, const int lp3, const int rp3,
+                         const int ne0, const int ne1, const int ne2, const int ne3,
                         dpct::queue_ptr stream) {
    int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
-    dpct::dim3 gridDim(num_blocks, ne1, ne2 * ne3);
+    sycl::range<3> grid(ne2 * ne3, ne1, num_blocks);
    stream->parallel_for(
-        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
+        sycl::nd_range<3>(grid * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
-            pad_f32(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1,
-                    ne2, ne3, item_ct1);
+            pad_f32(src, s00, s01, s02, s03, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
+                    ne0, ne1, ne2, ne3, item_ct1);
        });
 }

@@ -71,22 +66,27 @@ void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
-    dpct::queue_ptr     stream = ctx.stream();
+    dpct::queue_ptr stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));

-    const int32_t lp0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t rp0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t lp1 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t rp1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t lp2 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t rp2 = ((const int32_t*)(dst->op_params))[5];
-    const int32_t lp3 = ((const int32_t*)(dst->op_params))[6];
-    const int32_t rp3 = ((const int32_t*)(dst->op_params))[7];
+    const size_t ts = ggml_type_size(src0->type);
+    const size_t s00 = src0->nb[0] / ts;
+    const size_t s01 = src0->nb[1] / ts;
+    const size_t s02 = src0->nb[2] / ts;
+    const size_t s03 = src0->nb[3] / ts;

-    pad_f32_sycl(src0_d, dst_d,
+    const int32_t lp0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t rp0 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t lp1 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t rp1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t lp2 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t rp2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t lp3 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t rp3 = ((const int32_t *)(dst->op_params))[7];
+
+    pad_f32_sycl(src0_d, s00, s01, s02, s03, dst_d,
                 lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
 }
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -79,6 +79,31 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
 };

+template <> struct block_q_t<GGML_TYPE_Q5_K> {
+    struct traits {
+        static constexpr uint32_t qk       = QK_K;
+        static constexpr uint32_t qi       = QI5_K;
+        static constexpr uint32_t qr       = QR5_K;
+        static constexpr uint32_t vdr_mmvq = 2;
+    };
+
+    // Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales] [dm]
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
+        auto qs_offset = block_index * (QK_K / 2);
+        auto qh_offset = n_blocks * (QK_K / 2) + block_index * (QK_K / 8);
+        return { qs_offset, qh_offset };
+    }
+
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        auto nblocks        = (nrows * (ncols / QK_K));
+        auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 8);
+        return { total_qs_bytes + block_index * K_SCALE_SIZE,
+                 total_qs_bytes + nblocks * K_SCALE_SIZE + block_index * sizeof(ggml_half2) };
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
 template <> struct block_q_t<GGML_TYPE_Q6_K> {
    struct traits {
        static constexpr uint32_t qk       = QK_K;
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -357,38 +357,31 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
    using q8_0_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q8_0>;
    using q8_0_traits = typename q8_0_block::traits;

-    __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int * v, const int * u, const float & d8_0, const sycl::half2 & ds8) {
-        int sumi = 0;
-
-#pragma unroll
-        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
-            // Q8_0 values are signed int8, no nibble extraction needed
-            // Direct dp4a: each int packs 4 int8 values
-            sumi = dpct::dp4a(v[i], u[i], sumi);
-        }
-
-        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
-
-        // Q8_0 has no bias term (values are signed), so just scale
-        return d8_0 * sumi * ds8f.x();
-    }
-
    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
                                     const sycl::half2 * q8_1_ds, const int & iqs) {
-        const int8_t * bq8_0 = static_cast<const int8_t *>(vbq) + ibx_offset.first;
-        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
-        int             v[q8_0_traits::vdr_mmvq];
-        int             u[q8_0_traits::vdr_mmvq];
+        const uint8_t * base = static_cast<const uint8_t *>(vbq);
+        const int8_t *  qs   = reinterpret_cast<const int8_t *>(base + ibx_offset.first);
+        const ggml_half  d   = *reinterpret_cast<const ggml_half *>(base + d_offset.first);
+
+        int v[q8_0_traits::vdr_mmvq];
+        int u[q8_0_traits::vdr_mmvq];

 #pragma unroll
        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
-            v[i] = get_int_from_int8(bq8_0, iqs + i);
+            v[i] = get_int_from_int8(qs, iqs + i);
            u[i] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
        }

-        return vec_dot_q8_0_q8_1_impl(v, u, d, *q8_1_ds);
-    };
+        int sumi = 0;
+#pragma unroll
+        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
+            sumi = dpct::dp4a(v[i], u[i], sumi);
+        }
+
+        const sycl::half2 ds_values = *q8_1_ds;
+        return static_cast<float>(d) * static_cast<float>(ds_values[0]) * sumi;
+    }
 };

 static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
@@ -481,6 +474,65 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
    }
 };

+template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K> {
+    static constexpr ggml_type gtype = GGML_TYPE_Q5_K;
+
+    using q5_k_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q5_K>;
+    using q5_k_traits = typename q5_k_block::traits;
+
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
+                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
+                                     const sycl::half2 * q8_1_ds, const int & iqs) {
+        const uint8_t *    base           = static_cast<const uint8_t *>(vbq);
+        const uint8_t *    qs             = base + ibx_offset.first;   // low 4 bits
+        const uint8_t *    qh_base        = base + ibx_offset.second;  // high bit
+        const uint8_t *    scs            = base + d_offset.first;
+        const ggml_half2 * dms            = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
+
+        const int        bq8_offset = QR5_K * ((iqs / 2) / (QI8_1 / 2));
+        const int *      ql_ptr     = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
+        const int *      qh_ptr     = (const int *) (qh_base + 4 * ((iqs / 2) % 4));
+        const uint16_t * scales     = (const uint16_t *) scs;
+
+        int   vl[2];
+        int   vh[2];
+        int   u[2 * QR5_K];
+        float d8[QR5_K];
+
+        vl[0] = ql_ptr[0];
+        vl[1] = ql_ptr[4];
+
+        vh[0] = qh_ptr[0] >> bq8_offset;
+        vh[1] = qh_ptr[4] >> bq8_offset;
+
+        uint16_t  aux[2];
+        const int j = (QR5_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
+        if (j < 2) {
+            aux[0] = scales[j + 0] & 0x3f3f;
+            aux[1] = scales[j + 2] & 0x3f3f;
+        } else {
+            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
+            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
+        }
+
+        const uint8_t * sc = (const uint8_t *) aux;
+        const uint8_t * m  = sc + 2;
+
+        for (int i = 0; i < QR5_K; ++i) {
+            const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
+            sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
+
+            d8[i]                   = ds_values[0];
+
+            const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
+            u[2 * i + 0]   = q8[0];
+            u[2 * i + 1]   = q8[4];
+        }
+
+        return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, *dms, d8);
+    }
+};
+
 template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
    static constexpr ggml_type gtype = GGML_TYPE_Q6_K;

--- a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp
@@ -1,5 +1,7 @@
 #include "ggml-remoting.h"

+#include <mutex>
+
 static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
    virtgpu * gpu = DEV_TO_GPU(dev);

--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -2443,6 +2443,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_POST_NORM,
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-ac6f7b44f60fde0091f0b3d99afde48f8c99b13a
+628249b398293fc8d2fa81a449ae2920a02c6523
--- a/scripts/sync_vendor.py
+++ b/scripts/sync_vendor.py
@@ -5,7 +5,7 @@ import os
 import sys
 import subprocess

-HTTPLIB_VERSION = "refs/tags/v0.43.3"
+HTTPLIB_VERSION = "refs/tags/v0.43.4"

 vendor = {
    "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1131,10 +1131,6 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
    }

-    // for differentiating model types
-    uint32_t n_vocab = 0;
-    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
-
    // for classifier models
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
    if (!classifier_labels.empty()) {
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -503,6 +503,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
                break;
+            case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE:
+                // Sarvam uses SPM-style BPE (same shape as Gemma4): spaces replaced with U+2581
+                // by the normalizer, BPE merges over the whole text on raw UTF-8.
+                regex_exprs = {
+                    "[^\\n]+|[\\n]+",
+                };
+                byte_encode = false;
+                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@@ -2005,6 +2013,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "gemma4") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
                escape_whitespaces = true;
+            } else if (
+                    tokenizer_pre == "sarvam-moe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE;
+                escape_whitespaces = true;
+                clean_spaces = false;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
                    tokenizer_pre == "jina-v2-code" ||
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM       = 48,
    LLAMA_VOCAB_PRE_TYPE_JAIS2           = 49,
    LLAMA_VOCAB_PRE_TYPE_GEMMA4          = 50,
+    LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE      = 51,
 };

 struct LLM_KV;
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -1,7 +1,8 @@
 #include "models.h"

 void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
-    const auto n_vocab = vocab.n_tokens();
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
--- a/src/models/gemma4.cpp
+++ b/src/models/gemma4.cpp
@@ -110,7 +110,13 @@ void llama_model_gemma4::load_arch_tensors(llama_model_loader &) {
            layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);

            // MoE FFN
-            layer.ffn_gate_up_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS,  "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
+            layer.ffn_gate_up_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS,  "weight", i), {n_embd, n_ff_exp * 2, n_expert}, TENSOR_NOT_REQUIRED);
+
+            if (layer.ffn_gate_up_exps == nullptr) {
+                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+            }
+
            layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,     "weight", i), {n_ff_exp, n_embd, n_expert}, 0);

            // per-expert scale will be loaded as down_exps_s at the end of the current switch case
@@ -286,8 +292,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para

            cur_moe = build_moe_ffn(cur_moe,
                    nullptr, // gate_inp
-                    nullptr, // up_exps
-                    nullptr, // gate_exps
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
                    model.layers[il].ffn_down_exps,
                    nullptr, // exp_probs_b (not used for gemma4)
                    n_expert, n_expert_used,
@@ -296,8 +302,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                    il, logits,
                    model.layers[il].ffn_gate_up_exps,
-                    nullptr, // up_exps_s
-                    nullptr, // gate_exps_s
+                    model.layers[il].ffn_up_exps_s,
+                    model.layers[il].ffn_gate_exps_s,
                    model.layers[il].ffn_down_exps_s);
            cur_moe = build_norm(cur_moe,
                    model.layers[il].ffn_post_norm_2, nullptr,
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -1,7 +1,8 @@
 #include "models.h"

 void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
-    const auto n_vocab = vocab.n_tokens();
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -8861,8 +8861,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                if (nh == 1 && hsk != 320 && hsk != 576) continue;
                                for (int nr3 : { 1, 3, }) {
                                    if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
-                                    for (int nr2 : { 1, 4, 12, 20, 32 }) {
+                                    for (int nr2 : { 1, 4, 8, 12, 16, 20, 32 }) {
+                                        if (nr2 ==  8 && hsk != 192) continue;
                                        if (nr2 == 12 && hsk != 128) continue;
+                                        if (nr2 == 16 && hsk != 192) continue;
                                        if (nr2 == 20 && (nh != 1 || hsk != 576)) continue;
                                        if (nr2 == 32 && (nh != 1 || hsk != 320)) continue;
                                        //for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
--- a/tests/test-reasoning-budget.cpp
+++ b/tests/test-reasoning-budget.cpp
@@ -70,20 +70,20 @@ static void test_reasoning_budget(
        llama_sampler_apply(sampler, &cur_p);

        // Check if forcing is active (all logits except one should be -INFINITY)
-        size_t not_neg_inf = 0;
-        llama_token not_neg_inf_token = -1;
+        size_t finite_count = 0;
+        llama_token finite_token = -1;
        for (size_t j = 0; j < cur.size(); j++) {
-            if (std::isfinite(cur[j].logit) || cur[j].logit > 0) { // +INFINITY
-                not_neg_inf++;
-                not_neg_inf_token = cur[j].id;
+            if (std::isfinite(cur[j].logit)) {
+                finite_count++;
+                finite_token = cur[j].id;
            }
        }

        llama_sampler_accept(sampler, sequence[i]);

-        fprintf(stderr, "    i=%zu: token=%d, not_neg_inf_count=%zu, not_neg_inf_token=%d\n", i, (int)sequence[i], not_neg_inf, (int)not_neg_inf_token);
+        fprintf(stderr, "    i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);

-        if (not_neg_inf == 1) {
+        if (finite_count == 1) {
            if (actual_force_start == SIZE_MAX) {
                actual_force_start = i;
            }
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1651,6 +1651,7 @@ Note:
 2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
    - If a model is running but updated or removed from the source, it will be unloaded
    - If a model is not running, it will be added or updated according to the source
+3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.

 The `status` object can be:

--- a/tools/server/public/bundle.js
+++ b/tools/server/public/bundle.js
@@ -6479,13 +6479,13 @@ currentMessageId)},createToolResultMessage:async(toolCallId,content2,extras)=>{c
 activeMessages[conversationsStore.activeMessages.length-1],msg=await DatabaseService.createMessageBranch({convId,type:MessageType.TEXT,role:MessageRole.ASSISTANT,content:"",timestamp:Date.now(),toolCalls:"",children:[],model:resolvedModel},lastMsg.id);return conversationsStore.addMessageToActive(msg),currentMessageId=msg.id,msg},onFlowComplete:finalTimings=>{if(finalTimings){const idx=conversationsStore.findMessageIndex(assistantMessage.id);conversationsStore.updateMessageAtIndex(idx,{timings:finalTimings}),
 DatabaseService.updateMessage(assistantMessage.id,{timings:finalTimings}).catch(console.error)}cleanupStreamingState(),onComplete&&onComplete(streamedContent),isRouterMode()&&modelsStore.fetchRouterModels().catch(console.error),config$1().preEncodeConversation&&this.triggerPreEncode(allMessages,assistantMessage,streamedContent,effectiveModel,!!config$1().excludeReasoningFromContext)},onError:error2=>{if(this.setStreamingActive(!1),isAbortError(error2)){cleanupStreamingState();const pending=this.
 consumePendingMessage(convId);pending&&this.sendMessage(pending.content,pending.extras);return}console.error("Streaming error:",error2),cleanupStreamingState(),this.clearPendingMessage(convId);const idx=conversationsStore.findMessageIndex(assistantMessage.id);if(idx!==-1){const failedMessage=conversationsStore.removeMessageAtIndex(idx);failedMessage&&DatabaseService.deleteMessage(failedMessage.id).catch(console.error)}const contextInfo=error2.contextInfo;this.showErrorDialog({type:error2.name===
-"TimeoutError"?ErrorDialogType.TIMEOUT:ErrorDialogType.SERVER,message:error2.message,contextInfo}),onError&&onError(error2)}},perChatOverrides=conversationsStore.activeConversation?.mcpServerOverrides;if((await agenticStore.runAgenticFlow({conversationId:convId,messages:allMessages,options:{...this.getApiOptions(),...effectiveModel?{model:effectiveModel}:{}},callbacks:streamCallbacks,signal:abortController.signal,perChatOverrides})).handled){const pending=agenticStore.consumePendingSteeringMessage(
-convId);pending&&await this.sendMessage(pending.content,pending.extras);return}await ChatService.sendMessage(allMessages,{...this.getApiOptions(),...effectiveModel?{model:effectiveModel}:{},stream:!0,onChunk:streamCallbacks.onChunk,onReasoningChunk:streamCallbacks.onReasoningChunk,onModel:streamCallbacks.onModel,onTimings:streamCallbacks.onTimings,onComplete:async(finalContent,reasoningContent,timings,toolCalls)=>{const content2=streamedContent||finalContent||"",reasoning=streamedReasoningContent||
-reasoningContent,updateData={content:content2,reasoningContent:reasoning||void 0,toolCalls:toolCalls||"",timings};resolvedModel&&!modelPersisted&&(updateData.model=resolvedModel),await DatabaseService.updateMessage(currentMessageId,updateData);const idx=conversationsStore.findMessageIndex(currentMessageId),uiUpdate={content:content2,reasoningContent:reasoning||void 0,toolCalls:toolCalls||""};timings&&(uiUpdate.timings=timings),resolvedModel&&(uiUpdate.model=resolvedModel),conversationsStore.updateMessageAtIndex(
-idx,uiUpdate),await conversationsStore.updateCurrentNode(currentMessageId),cleanupStreamingState(),onComplete&&await onComplete(content2),isRouterMode()&&modelsStore.fetchRouterModels().catch(console.error),firstUserMessageContent&&await this.generateTitleWithLLM(firstUserMessageContent,streamedContent,convId);const pending=this.consumePendingMessage(convId);pending&&await this.sendMessage(pending.content,pending.extras)},onError:streamCallbacks.onError},convId,abortController.signal)}async stopGeneration(){
-const activeConv=conversationsStore.activeConversation;activeConv&&await this.stopGenerationForChat(activeConv.id)}async stopGenerationForChat(convId){await this.savePartialResponseIfNeeded(convId),this.setStreamingActive(!1),this.abortRequest(convId),this.setChatLoading(convId,!1),this.clearChatStreaming(convId),this.setProcessingState(convId,null),this.clearPendingMessage(convId)}async generateTitleWithLLM(userContent,assistantContent,convId){const effectiveModel=isRouterMode()&&selectedModelName()?
-selectedModelName():void 0,configValue=config$1(),titlePrompt=(typeof configValue.titleGenerationPrompt=="string"&&configValue.titleGenerationPrompt.trim()?configValue.titleGenerationPrompt:TITLE_GENERATION.DEFAULT_PROMPT).replace("{{USER}}",String(userContent||"")).replace("{{ASSISTANT}}",String(assistantContent||"")),titleMessage={role:MessageRole.USER,content:titlePrompt},titleResponse=await ChatService.generateTitle(titleMessage,effectiveModel);if(!titleResponse)return;let cleanTitle=titleResponse.
-trim();if(cleanTitle=cleanTitle.replace(TITLE_GENERATION.PREFIX_PATTERN,"").replace(TITLE_GENERATION.QUOTE_PATTERN,"").trim(),!cleanTitle||cleanTitle.length<TITLE_GENERATION.MIN_LENGTH){const firstLine=userContent.split(`
+"TimeoutError"?ErrorDialogType.TIMEOUT:ErrorDialogType.SERVER,message:error2.message,contextInfo}),onError&&onError(error2)}},perChatOverrides=conversationsStore.activeConversation?.mcpServerOverrides;if((await agenticStore.runAgenticFlow({conversationId:convId,messages:allMessages,options:{...this.getApiOptions(),...effectiveModel?{model:effectiveModel}:{}},callbacks:streamCallbacks,signal:abortController.signal,perChatOverrides})).handled){firstUserMessageContent&&await this.generateTitleWithLLM(
+firstUserMessageContent,streamedContent,convId);const pending=agenticStore.consumePendingSteeringMessage(convId);pending&&await this.sendMessage(pending.content,pending.extras);return}await ChatService.sendMessage(allMessages,{...this.getApiOptions(),...effectiveModel?{model:effectiveModel}:{},stream:!0,onChunk:streamCallbacks.onChunk,onReasoningChunk:streamCallbacks.onReasoningChunk,onModel:streamCallbacks.onModel,onTimings:streamCallbacks.onTimings,onComplete:async(finalContent,reasoningContent,timings,toolCalls)=>{
+const content2=streamedContent||finalContent||"",reasoning=streamedReasoningContent||reasoningContent,updateData={content:content2,reasoningContent:reasoning||void 0,toolCalls:toolCalls||"",timings};resolvedModel&&!modelPersisted&&(updateData.model=resolvedModel),await DatabaseService.updateMessage(currentMessageId,updateData);const idx=conversationsStore.findMessageIndex(currentMessageId),uiUpdate={content:content2,reasoningContent:reasoning||void 0,toolCalls:toolCalls||""};timings&&(uiUpdate.timings=
+timings),resolvedModel&&(uiUpdate.model=resolvedModel),conversationsStore.updateMessageAtIndex(idx,uiUpdate),await conversationsStore.updateCurrentNode(currentMessageId),cleanupStreamingState(),onComplete&&await onComplete(content2),isRouterMode()&&modelsStore.fetchRouterModels().catch(console.error),firstUserMessageContent&&await this.generateTitleWithLLM(firstUserMessageContent,streamedContent,convId);const pending=this.consumePendingMessage(convId);pending&&await this.sendMessage(pending.content,
+pending.extras)},onError:streamCallbacks.onError},convId,abortController.signal)}async stopGeneration(){const activeConv=conversationsStore.activeConversation;activeConv&&await this.stopGenerationForChat(activeConv.id)}async stopGenerationForChat(convId){await this.savePartialResponseIfNeeded(convId),this.setStreamingActive(!1),this.abortRequest(convId),this.setChatLoading(convId,!1),this.clearChatStreaming(convId),this.setProcessingState(convId,null),this.clearPendingMessage(convId)}async generateTitleWithLLM(userContent,assistantContent,convId){
+const effectiveModel=isRouterMode()&&selectedModelName()?selectedModelName():void 0,configValue=config$1(),titlePrompt=(typeof configValue.titleGenerationPrompt=="string"&&configValue.titleGenerationPrompt.trim()?configValue.titleGenerationPrompt:TITLE_GENERATION.DEFAULT_PROMPT).replace("{{USER}}",String(userContent||"")).replace("{{ASSISTANT}}",String(assistantContent||"")),titleMessage={role:MessageRole.USER,content:titlePrompt},titleResponse=await ChatService.generateTitle(titleMessage,effectiveModel);
+if(!titleResponse)return;let cleanTitle=titleResponse.trim();if(cleanTitle=cleanTitle.replace(TITLE_GENERATION.PREFIX_PATTERN,"").replace(TITLE_GENERATION.QUOTE_PATTERN,"").trim(),!cleanTitle||cleanTitle.length<TITLE_GENERATION.MIN_LENGTH){const firstLine=userContent.split(`
 `).find(l=>l.trim().length>0);cleanTitle=firstLine?firstLine.trim():TITLE_GENERATION.FALLBACK}cleanTitle&&cleanTitle.length>=TITLE_GENERATION.MIN_LENGTH&&await conversationsStore.updateConversationName(convId,cleanTitle)}async savePartialResponseIfNeeded(convId){const conversationId=convId||conversationsStore.activeConversation?.id;if(!conversationId)return;const streamingState=this.getChatStreaming(conversationId);if(!streamingState||!streamingState.response.trim())return;const messages=conversationId===
 conversationsStore.activeConversation?.id?conversationsStore.activeMessages:await conversationsStore.getConversationMessages(conversationId);if(!messages.length)return;const lastMessage=messages[messages.length-1];if(lastMessage?.role===MessageRole.ASSISTANT)try{const updateData={content:streamingState.response},lastKnownState=this.getProcessingState(conversationId);lastKnownState&&(updateData.timings={prompt_n:lastKnownState.promptTokens||0,prompt_ms:lastKnownState.promptMs,predicted_n:lastKnownState.
 tokensDecoded||0,cache_n:lastKnownState.cacheTokens||0,predicted_ms:lastKnownState.tokensPerSecond&&lastKnownState.tokensDecoded?lastKnownState.tokensDecoded/lastKnownState.tokensPerSecond*1e3:void 0}),await DatabaseService.updateMessage(lastMessage.id,updateData),lastMessage.content=streamingState.response,updateData.timings&&(lastMessage.timings=updateData.timings)}catch(error2){lastMessage.content=streamingState.response,console.error("Failed to save partial response:",error2)}}async updateMessage(messageId,newContent){
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1317,7 +1317,7 @@ private:
                return false;
            }

-            const bool need_logits = task.params.sampling.n_probs > 0;
+            const bool need_pre_sample_logits = task.params.sampling.n_probs > 0 && !task.params.post_sampling_probs;

            bool backend_sampling = true;

@@ -1326,8 +1326,8 @@ private:
            // TODO: speculative decoding requires multiple samples per batch - not supported yet
            backend_sampling &= !(slot.can_speculate() && common_speculative_n_max(slot.spec.get(), task.params.speculative) > 0);

-            // TODO: getting post/pre sampling logits is not yet supported with backend sampling
-            backend_sampling &= !need_logits;
+            // TODO: getting pre sampling logits is not yet supported with backend sampling
+            backend_sampling &= !need_pre_sample_logits;

            // TODO: tmp until backend sampling is fully implemented
            if (backend_sampling) {
@@ -1504,6 +1504,12 @@ private:
            // set probability for top n_probs tokens
            result.probs.reserve(n_probs);
            for (size_t i = 0; i < n_probs; i++) {
+                // Some samplers do return 0.0 probabilities, others don't.
+                // Filter 0.0 probailities, to ensure the behavior is consistent.
+                if (cur_p->data[i].p == 0.0) {
+                    break;
+                }
+
                result.probs.push_back({
                    cur_p->data[i].id,
                    common_token_to_piece(ctx, cur_p->data[i].id, special),
@@ -3926,22 +3932,7 @@ void server_routes::init_routes() {
            }},
            {"object", "list"},
            {"data", {
-                {
-                    {"id",       meta->model_name},
-                    {"aliases",  meta->model_aliases},
-                    {"tags",     meta->model_tags},
-                    {"object",   "model"},
-                    {"created",  std::time(0)},
-                    {"owned_by", "llamacpp"},
-                    {"meta",     {
-                        {"vocab_type",  meta->model_vocab_type},
-                        {"n_vocab",     meta->model_vocab_n_tokens},
-                        {"n_ctx_train", meta->model_n_ctx_train},
-                        {"n_embd",      meta->model_n_embd_inp},
-                        {"n_params",    meta->model_n_params},
-                        {"size",        meta->model_size},
-                    }},
-                },
+                get_model_info(),
            }}
        };

@@ -4155,6 +4146,26 @@ void server_routes::init_routes() {
    };
 }

+json server_routes::get_model_info() const {
+    return json {
+        {"id",       meta->model_name},
+        {"aliases",  meta->model_aliases},
+        {"tags",     meta->model_tags},
+        {"object",   "model"},
+        {"created",  std::time(0)},
+        {"owned_by", "llamacpp"},
+        {"meta",     {
+            {"vocab_type",  meta->model_vocab_type},
+            {"n_vocab",     meta->model_vocab_n_tokens},
+            {"n_ctx",       meta->slot_n_ctx},
+            {"n_ctx_train", meta->model_n_ctx_train},
+            {"n_embd",      meta->model_n_embd_inp},
+            {"n_params",    meta->model_n_params},
+            {"size",        meta->model_size},
+        }},
+    };
+}
+
 std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const server_http_req & req, int id_slot) {
    auto res = create_response();
    const json request_data = json::parse(req.body);
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -122,6 +122,10 @@ struct server_routes {
    server_http_context::handler_t post_rerank;
    server_http_context::handler_t get_lora_adapters;
    server_http_context::handler_t post_lora_adapters;
+
+    // to be used in router mode
+    json get_model_info() const;
+
 private:
    std::unique_ptr<server_res_generator> handle_completions_impl(
            const server_http_req & req,
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -4,7 +4,9 @@

 #include <cpp-httplib/httplib.h>

+#include <cstdlib>
 #include <functional>
+#include <future>
 #include <string>
 #include <thread>

@@ -51,11 +53,51 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
    SRV_DBG("response: %s\n", res.body.c_str());
 }

+// For Google Cloud Platform deployment compatibility
+struct gcp_params {
+    bool enabled;
+    std::string path_health;
+    std::string path_predict;
+    int port;
+
+    // Ref: https://docs.cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables
+    gcp_params() {
+        enabled = getenv("AIP_MODE", "") == "PREDICTION";
+        path_health = getenv("AIP_HEALTH_ROUTE", "", true); // default: using the route defined in server.cpp
+        path_predict = getenv("AIP_PREDICT_ROUTE", "/predict", true);
+        port = std::stoi(getenv("AIP_HTTP_PORT", "8080"));
+    }
+
+    static std::string getenv(const char * name, const std::string & default_value, bool ensure_leading_slash = false) {
+        const char * value = std::getenv(name);
+        if (value == nullptr || value[0] == '\0') {
+            return default_value;
+        }
+        std::string val = value;
+        if (ensure_leading_slash && !val.empty() && val[0] != '/') {
+            val.insert(val.begin(), '/');
+        }
+        return val;
+    }
+};
+
 bool server_http_context::init(const common_params & params) {
+    const gcp_params gcp;
+
    path_prefix = params.api_prefix;
    port = params.port;
    hostname = params.hostname;

+    if (gcp.enabled) {
+        LOG_INF("%s: Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", __func__, gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);
+
+        if (port != gcp.port) {
+            LOG_WRN("%s: Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", __func__, port, gcp.port);
+        }
+
+        port = gcp.port;
+    }
+
    auto & srv = pimpl->srv;

 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
@@ -420,6 +462,7 @@ static void process_handler_response(server_http_req_ptr && request, server_http
 }

 void server_http_context::get(const std::string & path, const server_http_context::handler_t & handler) const {
+    handlers.emplace(path, handler);
    pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
        server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
            get_params(req),
@@ -436,6 +479,7 @@ void server_http_context::get(const std::string & path, const server_http_contex
 }

 void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const {
+    handlers.emplace(path, handler);
    pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
        std::string body = req.body;
        std::map<std::string, uploaded_file> files;
@@ -481,3 +525,176 @@ void server_http_context::post(const std::string & path, const server_http_conte
    });
 }

+//
+// Vertex AI Prediction protocol (AIP_PREDICT_ROUTE)
+// https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements
+//
+
+// Derives the camelCase @requestFormat alias for a registered path.
+// e.g. "/v1/chat/completions" -> "chatCompletions", "/apply-template" -> "applyTemplate"
+static std::string path_to_gcp_format(const std::string & path) {
+    std::string s = path;
+    if (s.size() > 3 && s[0] == '/' && s[1] == 'v' && s[2] == '1') {
+        s = s.substr(3);
+    }
+    if (!s.empty() && s[0] == '/') {
+        s = s.substr(1);
+    }
+    std::string result;
+    bool cap = false;
+    for (unsigned char c : s) {
+        if (c == ':') break; // stop before path parameters
+        if (c == '/' || c == '-' || c == '_') {
+            cap = true;
+        } else {
+            result += cap ? (char)std::toupper(c) : (char)c;
+            cap = false;
+        }
+    }
+    return result;
+}
+
+static json parse_gcp_predict_response(const server_http_res_ptr & res) {
+    if (res == nullptr) {
+        throw std::runtime_error("empty response from internal handler");
+    }
+    if (res->is_stream()) {
+        throw std::invalid_argument("predict route does not support streaming responses");
+    }
+    if (res->data.empty()) {
+        return nullptr;
+    }
+    try {
+        return json::parse(res->data);
+    } catch (...) {
+        return res->data;
+    }
+}
+
+void server_http_context::register_gcp_compat() {
+    const gcp_params gcp;
+
+    if (!gcp.enabled) {
+        // do nothing
+        return;
+    }
+
+    if (handlers.count(gcp.path_predict)) {
+        LOG_ERR("%s: AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", __func__, gcp.path_predict.c_str());
+        exit(1);
+    }
+
+    // camelCase alias -> canonical path (first registration wins on collision)
+    // e.g. "chatCompletions" -> "/v1/chat/completions"
+    std::unordered_map<std::string, std::string> alias_to_path;
+    for (const auto & [path, _] : handlers) {
+        alias_to_path.emplace(path_to_gcp_format(path), path);
+    }
+
+    if (!gcp.path_health.empty()) {
+        auto health_handler = handlers.find("/health");
+        GGML_ASSERT(health_handler != handlers.end());
+        get(gcp.path_health, health_handler->second);
+    }
+
+    post(gcp.path_predict, [this, alias_to_path = std::move(alias_to_path)](const server_http_req & req) -> server_http_res_ptr {
+        static const auto build_error = [](const std::string & message, error_type type) -> json {
+            return json {{"error", format_error_response(message, type)}};
+        };
+
+        json data;
+        try {
+            data = json::parse(req.body);
+        } catch (const std::exception & e) {
+            auto res = std::make_unique<server_http_res>();
+            res->status = 400;
+            res->data = safe_json_to_str({{"error", format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)}});
+            return res;
+        }
+        if (!data.is_object()) {
+            auto res = std::make_unique<server_http_res>();
+            res->status = 400;
+            res->data = safe_json_to_str({{"error", format_error_response("request body must be a JSON object", ERROR_TYPE_INVALID_REQUEST)}});
+            return res;
+        }
+        if (!data.contains("instances") || !data.at("instances").is_array()) {
+            auto res = std::make_unique<server_http_res>();
+            res->status = 400;
+            res->data = safe_json_to_str({{"error", format_error_response("request body must include an array field named instances", ERROR_TYPE_INVALID_REQUEST)}});
+            return res;
+        }
+
+        const json & instances = data.at("instances");
+        static const size_t MAX_INSTANCES = 128;
+        if (instances.size() > MAX_INSTANCES) {
+            auto res = std::make_unique<server_http_res>();
+            res->status = 400;
+            res->data = safe_json_to_str({{"error", format_error_response("instances array exceeds maximum size of " + std::to_string(MAX_INSTANCES), ERROR_TYPE_INVALID_REQUEST)}});
+            return res;
+        }
+
+        std::vector<std::future<json>> futures;
+        futures.reserve(instances.size());
+
+        for (const auto & instance : instances) {
+            futures.push_back(std::async(std::launch::async, [this, &req, &alias_to_path, instance]() -> json {
+                if (!instance.is_object()) {
+                    return build_error("each instance must be a JSON object", ERROR_TYPE_INVALID_REQUEST);
+                }
+                if (!instance.contains("@requestFormat") || !instance.at("@requestFormat").is_string()) {
+                    return build_error("each instance must include a string @requestFormat", ERROR_TYPE_INVALID_REQUEST);
+                }
+
+                try {
+                    json payload = instance;
+                    const std::string format = payload.at("@requestFormat").get<std::string>();
+                    payload.erase("@requestFormat");
+
+                    if (payload.contains("stream")) {
+                        LOG_WRN("%s: ignoring client-provided stream field in instance, streaming is not supported in predict route\n", __func__);
+                        payload["stream"] = false;
+                    }
+
+                    // accept both camelCase aliases (e.g. "chatCompletions") and direct paths
+                    std::string dispatch_path;
+                    auto it_alias = alias_to_path.find(format);
+                    if (it_alias != alias_to_path.end()) {
+                        dispatch_path = it_alias->second;
+                    } else if (handlers.count(format)) {
+                        dispatch_path = format;
+                    } else {
+                        return build_error("no handler registered for @requestFormat: " + format, ERROR_TYPE_INVALID_REQUEST);
+                    }
+
+                    const server_http_req internal_req {
+                        req.params,
+                        req.headers,
+                        path_prefix + dispatch_path,
+                        req.query_string,
+                        payload.dump(),
+                        {},
+                        req.should_stop,
+                    };
+
+                    server_http_res_ptr internal_res = handlers.at(dispatch_path)(internal_req);
+                    return parse_gcp_predict_response(internal_res);
+                } catch (const std::invalid_argument & e) {
+                    return build_error(e.what(), ERROR_TYPE_INVALID_REQUEST);
+                } catch (const std::exception & e) {
+                    return build_error(e.what(), ERROR_TYPE_SERVER);
+                } catch (...) {
+                    return build_error("unknown error", ERROR_TYPE_SERVER);
+                }
+            }));
+        }
+
+        json predictions = json::array();
+        for (auto & future : futures) {
+            predictions.push_back(future.get());
+        }
+
+        auto res = std::make_unique<server_http_res>();
+        res->data = safe_json_to_str({{"predictions", predictions}});
+        return res;
+    });
+}
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -67,6 +67,10 @@ struct server_http_context {
    std::thread thread; // server thread
    std::atomic<bool> is_ready = false;

+    // note: the handler should never throw exceptions
+    using handler_t = std::function<server_http_res_ptr(const server_http_req & req)>;
+    mutable std::unordered_map<std::string, handler_t> handlers;
+
    std::string path_prefix;
    std::string hostname;
    int port;
@@ -78,12 +82,13 @@ struct server_http_context {
    bool start();
    void stop() const;

-    // note: the handler should never throw exceptions
-    using handler_t = std::function<server_http_res_ptr(const server_http_req & req)>;
-
    void get(const std::string & path, const handler_t & handler) const;
    void post(const std::string & path, const handler_t & handler) const;

+    // Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict)
+    // Must be called AFTER all other API routes are registered
+    void register_gcp_compat();
+
    // for debugging
    std::string listening_address;
 };
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -44,6 +44,7 @@ extern char **environ;
 #define CMD_ROUTER_TO_CHILD_EXIT  "cmd_router_to_child:exit"
 #define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
 #define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
+#define CMD_CHILD_TO_ROUTER_INFO  "cmd_child_to_router:info:" // followed by json string

 // address for child process, this is needed because router may run on 0.0.0.0
 // ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -718,10 +719,11 @@ void server_models::load(const std::string & name) {

    // prepare new instance info
    instance_t inst;
-    inst.meta           = meta;
-    inst.meta.port      = get_free_port();
-    inst.meta.status    = SERVER_MODEL_STATUS_LOADING;
-    inst.meta.last_used = ggml_time_ms();
+    inst.meta             = meta;
+    inst.meta.port        = get_free_port();
+    inst.meta.status      = SERVER_MODEL_STATUS_LOADING;
+    inst.meta.loaded_info = json{};
+    inst.meta.last_used   = ggml_time_ms();

    if (inst.meta.port <= 0) {
        throw std::runtime_error("failed to get a port number");
@@ -767,12 +769,14 @@ void server_models::load(const std::string & name) {
            // read stdout/stderr and forward to main server log
            // also handle status report from child process
            if (stdout_file) {
-                char buffer[4096];
+                char buffer[128 * 1024]; // large buffer for storing info
                while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
                    LOG("[%5d] %s", port, buffer);
                    std::string str(buffer);
                    if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
                        this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
+                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
+                        this->update_loaded_info(name, str);
                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
                        this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
                    }
@@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status
    cv.notify_all();
 }

+void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
+    if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
+        SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
+        return;
+    }
+
+    json info;
+    try {
+        info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
+    } catch (const std::exception & e) {
+        SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
+        return;
+    }
+
+    std::unique_lock<std::mutex> lk(mutex);
+    auto it = mapping.find(name);
+    if (it != mapping.end()) {
+        auto & meta = it->second.meta;
+        meta.loaded_info = info;
+    }
+    cv.notify_all();
+}
+
 void server_models::wait_until_loading_finished(const std::string & name) {
    std::unique_lock<std::mutex> lk(mutex);
    cv.wait(lk, [this, &name]() {
@@ -994,12 +1021,14 @@ bool server_models::is_child_server() {
    return router_port != nullptr;
 }

-std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
+std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
    // send a notification to the router server that a model instance is ready
    common_log_pause(common_log_main());
    fflush(stdout);
    fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
    fflush(stdout);
+    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
+    fflush(stdout);
    common_log_resume(common_log_main());

    // setup thread for monitoring stdin
@@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() {
                status["exit_code"] = meta.exit_code;
                status["failed"]    = true;
            }
-            models_json.push_back(json {
+
+            json model_info = json {
                {"id",       meta.name},
                {"aliases",  meta.aliases},
                {"tags",     meta.tags},
@@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() {
                {"created",  t},          // for OAI-compat
                {"status",   status},
                // TODO: add other fields, may require reading GGUF metadata
-            });
+            };
+
+            // merge with loaded_info from the child process if available
+            if (meta.is_running()) {
+                for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) {
+                    if (!model_info.contains(it.key())) {
+                        model_info[it.key()] = it.value();
+                    }
+                }
+            }
+            models_json.push_back(model_info);
        }
        res_ok(res, {
            {"data", models_json},
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -63,6 +63,7 @@ struct server_model_meta {
    server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
    int64_t last_used = 0; // for LRU unloading
    std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
+    json loaded_info; // info to be reflected via /v1/models endpoint
    int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
    int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown

@@ -145,6 +146,7 @@ public:

    // update the status of a model instance (thread-safe)
    void update_status(const std::string & name, server_model_status status, int exit_code);
+    void update_loaded_info(const std::string & name, std::string & raw_info);

    // wait until the model instance is fully loaded (thread-safe)
    // return when the model no longer in "loading" state
@@ -163,7 +165,7 @@ public:

    // notify the router server that a model instance is ready
    // return the monitoring thread (to be joined by the caller)
-    static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
+    static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);

    // notify the router server that the sleeping state has changed
    static void notify_router_sleeping_state(bool sleeping);
--- a/tools/server/server-queue.cpp
+++ b/tools/server/server-queue.cpp
@@ -381,7 +381,8 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
        if (result == nullptr) {
            // timeout, check stop condition
            if (should_stop()) {
-                SRV_DBG("%s", "stopping wait for next result due to should_stop condition\n");
+                SRV_WRN("%s", "stopping wait for next result due to should_stop condition (adjust the --timeout argument if needed)\n");
+                SRV_WRN("%s", "ref: https://github.com/ggml-org/llama.cpp/pull/22907\n");
                return nullptr;
            }
        } else {
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -204,6 +204,10 @@ int main(int argc, char ** argv) {
    // Save & load slots
    ctx_http.get ("/slots",                    ex_wrapper(routes.get_slots));
    ctx_http.post("/slots/:id_slot",           ex_wrapper(routes.post_slots));
+
+    // Google Cloud Platform (Vertex AI) compat
+    ctx_http.register_gcp_compat();
+
    // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
    if (params.webui_mcp_proxy) {
        SRV_WRN("%s", "-----------------\n");
@@ -334,7 +338,8 @@ int main(int argc, char ** argv) {
        // optionally, notify router server that this instance is ready
        std::thread monitor_thread;
        if (server_models::is_child_server()) {
-            monitor_thread = server_models::setup_child_server(shutdown_handler);
+            json model_info = routes.get_model_info();
+            monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
        }

        // this call blocks the main thread until queue_tasks.terminate() is called
--- a/tools/server/tests/unit/test_compat_gcp.py
+++ b/tools/server/tests/unit/test_compat_gcp.py
@@ -0,0 +1,60 @@
+import pytest
+from utils import *
+
+server: ServerProcess
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.gcp_compat = True
+
+
+def test_gcp_predict_camel_case():
+    global server
+    server.start()
+    res = server.make_request("POST", "/predict", data={
+        "instances": [
+            {
+                "@requestFormat": "chatCompletions",
+                "max_tokens": 8,
+                "messages": [
+                    {"role": "user", "content": "What is the meaning of life?"},
+                ],
+            }
+        ],
+    })
+    assert res.status_code == 200
+    assert "predictions" in res.body
+    assert len(res.body["predictions"]) == 1
+    prediction = res.body["predictions"][0]
+    assert "choices" in prediction
+    assert len(prediction["choices"]) == 1
+    assert prediction["choices"][0]["message"]["role"] == "assistant"
+    assert len(prediction["choices"][0]["message"]["content"]) > 0
+
+
+def test_gcp_predict_multiple_instances():
+    global server
+    server.n_slots = 2
+    server.start()
+    res = server.make_request("POST", "/predict", data={
+        "instances": [
+            {
+                "@requestFormat": "chatCompletions",
+                "max_tokens": 8,
+                "messages": [{"role": "user", "content": "Say hello"}],
+            },
+            {
+                "@requestFormat": "chatCompletions",
+                "max_tokens": 8,
+                "messages": [{"role": "user", "content": "Say world"}],
+            },
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body["predictions"]) == 2
+    for prediction in res.body["predictions"]:
+        assert "choices" in prediction
+        assert len(prediction["choices"][0]["message"]["content"]) > 0
--- a/tools/server/tests/unit/test_completion.py
+++ b/tools/server/tests/unit/test_completion.py
@@ -491,29 +491,82 @@ def test_n_probs_post_sampling():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
-        "prompt": "I believe the meaning of life is",
+        "prompt": "Today was the day. Today I would finally become a",
        "n_probs": 10,
-        "temperature": 0.0,
+        "temperature": 1.0,
        "n_predict": 5,
        "post_sampling_probs": True,
    })
    assert res.status_code == 200
    assert "completion_probabilities" in res.body
    assert len(res.body["completion_probabilities"]) == 5
-    for tok in res.body["completion_probabilities"]:
+    for (i, tok) in enumerate(res.body["completion_probabilities"]):
        assert "id" in tok and tok["id"] > 0
        assert "token" in tok and type(tok["token"]) == str
        assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
        assert "bytes" in tok and type(tok["bytes"]) == list
-        assert len(tok["top_probs"]) == 10
+        assert "top_probs" in tok and type(tok["top_probs"]) == list
+
        for prob in tok["top_probs"]:
            assert "id" in prob and prob["id"] > 0
            assert "token" in prob and type(prob["token"]) == str
-            assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
+            # 0.0 probability tokens should never be returned by the server
+            assert "prob" in prob and 0.0 < prob["prob"] <= 1.0
            assert "bytes" in prob and type(prob["bytes"]) == list
-        # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs
-        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])

+        if i == 0:
+            # The prompt is vague enough that we should get at least 10 possibilities
+            # for the first token.
+            assert len(tok["top_probs"]) == 10
+
+        if len(tok["top_probs"]) < 10:
+            # Getting less than the requested number of probabilities should only happen
+            # if the ones we did get already sum to 1.0.
+            assert sum(p["prob"] for p in tok["top_probs"]) == pytest.approx(1.0)
+
+def test_n_probs_post_backend_sampling():
+    """Verify that the same probabilities are returned with and without backend sampling."""
+    global server
+    server.backend_sampling = True
+    server.start()
+
+    def make_request(backend_sampling):
+        n_predict = 20
+
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "The countries of Europe, in random order, are:",
+            "n_probs": 10,
+            "n_predict": n_predict,
+            "post_sampling_probs": True,
+            "seed": 4242,
+            "backend_sampling": backend_sampling,
+        })
+        assert res.status_code == 200
+
+        total_probs = 0
+        completions = res.body["completion_probabilities"]
+        assert len(completions) == n_predict
+        for tok in completions:
+            # Handling of 0.0 probabilities differs between samplers and backend sampling. Filter them to normalize the
+            # data.
+            tok["top_probs"] = [x for x in tok["top_probs"] if x["prob"] > 0.0]
+            total_probs += len(tok["top_probs"])
+        # Verify that we got at least two top probs on average, to ensure the effectiveness of the test.
+        assert total_probs >= 2 * n_predict
+        return completions
+
+    def verify_token(a, b):
+        assert a["id"] == b["id"]
+        assert a["token"] == b["token"]
+        assert a["bytes"] == b["bytes"]
+        assert a["prob"] == pytest.approx(b["prob"], abs=0.01)
+
+    for (a, b) in zip(make_request(True), make_request(False)):
+        verify_token(a, b)
+        assert len(a["top_probs"]) == len(b["top_probs"])
+
+        for (aa, bb) in zip(a["top_probs"], b["top_probs"]):
+            verify_token(aa, bb)

@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)])
 def test_logit_bias(tokenize, openai_style):
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -108,6 +108,8 @@ class ServerProcess:
    no_cache_idle_slots: bool = False
    log_path: str | None = None
    webui_mcp_proxy: bool = False
+    backend_sampling: bool = False
+    gcp_compat: bool = False

    # session variables
    process: subprocess.Popen | None = None
@@ -122,6 +124,9 @@ class ServerProcess:
        self.external_server = "DEBUG_EXTERNAL" in os.environ

    def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
+        env = {**os.environ}
+        if "LLAMA_CACHE" not in os.environ:
+            env["LLAMA_CACHE"] = "tmp"
        if self.external_server:
            print(f"[external_server]: Assuming external server running on {self.server_host}:{self.server_port}")
            return
@@ -248,6 +253,10 @@ class ServerProcess:
            server_args.append("--no-cache-idle-slots")
        if self.webui_mcp_proxy:
            server_args.append("--webui-mcp-proxy")
+        if self.backend_sampling:
+            server_args.append("--backend_sampling")
+        if self.gcp_compat:
+            env["AIP_MODE"] = "PREDICTION"

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"tests: starting server with: {' '.join(args)}")
@@ -268,7 +277,7 @@ class ServerProcess:
            creationflags=flags,
            stdout=self._log,
            stderr=self._log if self._log != sys.stdout else sys.stdout,
-            env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
+            env=env,
        )
        server_instances.add(self)

--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -856,6 +856,10 @@ class ChatStore {
 				perChatOverrides
 			});
 			if (agenticResult.handled) {
+				// Generate LLM based title for new conversations after agentic flow completes
+				if (firstUserMessageContent) {
+					await this.generateTitleWithLLM(firstUserMessageContent, streamedContent, convId);
+				}
 				// Check if there's a pending steering message to re-send
 				const pending = agenticStore.consumePendingSteeringMessage(convId);
 				if (pending) {
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@@ -41,7 +41,7 @@ if (LLAMA_BUILD_BORINGSSL)
    set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")

    set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
-    set(BORINGSSL_VERSION "0.20260413.0" CACHE STRING "BoringSSL version")
+    set(BORINGSSL_VERSION "0.20260508.0" CACHE STRING "BoringSSL version")

    message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")

--- a/vendor/cpp-httplib/httplib.cpp
+++ b/vendor/cpp-httplib/httplib.cpp
@@ -8980,10 +8980,22 @@ ssize_t ChunkedDecoder::read_payload(char *buf, size_t len,
    stream_line_reader lr(strm, line_buf, sizeof(line_buf));
    if (!lr.getline()) { return -1; }

-    char *endptr = nullptr;
-    unsigned long chunk_len = std::strtoul(lr.ptr(), &endptr, 16);
-    if (endptr == lr.ptr()) { return -1; }
-    if (chunk_len == ULONG_MAX) { return -1; }
+    // RFC 9112 §7.1: chunk-size = 1*HEXDIG
+    const char *p = lr.ptr();
+    int v = 0;
+    if (!is_hex(*p, v)) { return -1; }
+
+    size_t chunk_len = 0;
+    constexpr size_t chunk_len_max = (std::numeric_limits<size_t>::max)();
+    for (; is_hex(*p, v); ++p) {
+      if (chunk_len > (chunk_len_max >> 4)) { return -1; }
+      chunk_len = (chunk_len << 4) | static_cast<size_t>(v);
+    }
+
+    while (is_space_or_tab(*p)) {
+      ++p;
+    }
+    if (*p != '\0' && *p != ';' && *p != '\r' && *p != '\n') { return -1; }

    if (chunk_len == 0) {
      chunk_remaining = 0;
@@ -8993,7 +9005,7 @@ ssize_t ChunkedDecoder::read_payload(char *buf, size_t len,
      return 0;
    }

-    chunk_remaining = static_cast<size_t>(chunk_len);
+    chunk_remaining = chunk_len;
    last_chunk_total = chunk_remaining;
    last_chunk_offset = 0;
  }
--- a/vendor/cpp-httplib/httplib.h
+++ b/vendor/cpp-httplib/httplib.h
@@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H

-#define CPPHTTPLIB_VERSION "0.43.3"
-#define CPPHTTPLIB_VERSION_NUM "0x002b03"
+#define CPPHTTPLIB_VERSION "0.43.4"
+#define CPPHTTPLIB_VERSION_NUM "0x002b04"

 #ifdef _WIN32
 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00