ggml-hexagon: cpy: add contiguous fast-path in reshape copy (#23076)

HIP: RDNA3 mma FA, faster AMD transpose, tune AMD (#22880)
Adds RDNA3 support to the CUDA mma FA kernel. To make the RDNA3 tensor cores work with the FP16 accumulation for VKQ, the tiles need to be 32 logical units long in the direction of the attention head; for head sizes 80 and 112, which are not evenly divisible by 32, the regular length of 16 with FP32 accumulation is used instead. The longer tiles also enable more efficient transposition for a warp size of 32, which is why they are also used for RDNA4. However, this scrambles the data layout of the accumulators along the attention head dimension. To prevent accidental misuse I added another entry to ggml_cuda_mma::data_layout. I also tuned the kernel parameters for RDNA3, RDNA4, and CDNA1 in general, during which I discovered that the kernel can be made to work for head sizes up to 256 for CDNA. For RDNA3/4 I was not able to get better performance than the tile kernel for head sizes > 128.
2026-05-21 17:17:24 +03:00 · 2026-05-14 16:55:54 -07:00 · 2026-05-14 22:58:58 +02:00 · 2026-05-14 13:58:34 -07:00 · 2026-05-14 09:41:32 -07:00 · 2026-05-14 09:31:36 -07:00
197 changed files with 32411 additions and 25724 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -5,8 +5,15 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
+ARG LEVEL_ZERO_VERSION=1.28.2
+ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
 RUN apt-get update && \
-    apt-get install -y git libssl-dev
+    apt-get install -y git libssl-dev wget ca-certificates && \
+    cd /tmp && \
+    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
+    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
+    apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
+    rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb

 WORKDIR /app

@@ -33,11 +40,11 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG IGC_VERSION=v2.32.7
-ARG IGC_VERSION_FULL=2_2.32.7+21184
-ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
-ARG IGDGMM_VERSION=22.9.0
+ARG IGC_VERSION=v2.20.5
+ARG IGC_VERSION_FULL=2_2.20.5+19972
+ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
+ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
+ARG IGDGMM_VERSION=22.8.2
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
@@ -109,4 +116,3 @@ WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

 ENTRYPOINT [ "/app/llama-server" ]
-
--- a/.editorconfig
+++ b/.editorconfig
@@ -53,14 +53,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

-[tools/server/public/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
 [benches/**]
 indent_style = unset
 indent_size = unset
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +0,0 @@
-# Treat the generated single-file WebUI build as binary for diff purposes.
-# Git's pack-file delta compression still works (byte-level), but this prevents
-# git diff from printing the entire minified file on every change.
-tools/server/public/index.html -diff
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -77,7 +77,6 @@ server/webui:
    - changed-files:
        - any-glob-to-any-file:
            - tools/server/webui/**
-            - tools/server/public/**
 server:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -58,14 +58,45 @@ jobs:
          name: llama-cpp-android-arm64-snapdragon
          path: pkg-snapdragon/llama.cpp

+  linux-iot-snapdragon:
+    runs-on: ubuntu-latest
+    container:
+      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          lfs: false
+
+      - name: Build Llama.CPP for Snapdragon Linux IoT
+        id: build_llama_cpp_snapdragon_linux
+        run: |
+          cp docs/backend/snapdragon/CMakeUserPresets.json .
+          cmake --preset arm64-linux-snapdragon-release -B build-snapdragon -DGGML_OPENCL=ON
+          cmake --build build-snapdragon -j $(nproc)
+          cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
+
+      - name: Upload Llama.CPP Snapdragon Linux IoT Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_snapdragon_linux.outcome == 'success' }}
+        uses: actions/upload-artifact@v6
+        with:
+          name: llama-cpp-linux-arm64-snapdragon
+          path: pkg-snapdragon/llama.cpp
+
  test-snapdragon-qdc:
-    name: Test on QDC Android Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon]
-    runs-on: ubuntu-slim
+    name: Test on QDC Device (${{ matrix.device }})
+    needs: [android-ndk-snapdragon, linux-iot-snapdragon]
+    runs-on: ubuntu-24.04-arm
+    timeout-minutes: 90
    strategy:
      fail-fast: false
      matrix:
-        device: [SM8750, SM8650, SM8850]
+        device: [SM8750, SM8850, QCS9075M]

    steps:
      - name: Checkout
@@ -74,11 +105,11 @@ jobs:
      - name: Download build artifact
        uses: actions/download-artifact@v7
        with:
-          name: llama-cpp-android-arm64-snapdragon
+          name: ${{ startsWith(matrix.device, 'QCS') && 'llama-cpp-linux-arm64-snapdragon' || 'llama-cpp-android-arm64-snapdragon' }}
          path: pkg-snapdragon/llama.cpp

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: '3.x'
          cache: pip
@@ -107,7 +138,8 @@ jobs:
              --test       all \
              --pkg-dir    pkg-snapdragon/llama.cpp \
              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }}
+              --device     ${{ matrix.device }} \
+              ${{ startsWith(matrix.device, 'QCS') && '--retries 2 --retry-delay 300' || '' }}
        env:
          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}

--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -301,16 +301,17 @@ jobs:
          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
          cmake -B build -DLLAMA_OPENSSL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DGGML_CPU_REPACK=OFF \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
                         -DGGML_RVV=ON \
+                         -DGGML_RV_ZVFH=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
                         -DGGML_RV_ZIHINTPAUSE=ON \
-                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+                         -DGGML_RV_ZBA=ON \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -55,7 +55,22 @@ env:
  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
+  determine-tag:
+    name: Determine tag name
+    runs-on: ubuntu-slim
+    outputs:
+      tag_name: ${{ steps.tag.outputs.name }}
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
  ggml-ci-nvidia-cuda:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -65,11 +80,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          nvidia-smi
          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -79,11 +97,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm2:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]

    steps:
@@ -93,39 +114,40 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  # TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
-  #ggml-ci-nvidia-webgpu:
-  #  runs-on: [self-hosted, Linux, NVIDIA]
+  ggml-ci-nvidia-webgpu:
+    runs-on: [self-hosted, Linux, NVIDIA]

-  #  steps:
-  #    - name: Clone
-  #      id: checkout
-  #      uses: actions/checkout@v6
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6

-  #    - name: Dawn Dependency
-  #      id: dawn-depends
-  #      run: |
-  #        DAWN_VERSION="v20260317.182325"
-  #        DAWN_OWNER="google"
-  #        DAWN_REPO="dawn"
-  #        DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-  #        echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-  #        curl -L -o artifact.tar.gz \
-  #          "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-  #        mkdir dawn
-  #        tar -xvf artifact.tar.gz -C dawn --strip-components=1
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1

-  #    - name: Test
-  #      id: ggml-ci
-  #      run: |
-  #        GG_BUILD_WEBGPU=1 \
-  #        GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-  #        GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-  #          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_WEBGPU=1 \
+          GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+          GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
+            bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  # TODO: provision AMX-compatible machine
  #ggml-ci-cpu-amx:
@@ -172,6 +194,7 @@ jobs:
  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-mac-metal:
+    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -181,10 +204,13 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-mac-webgpu:
+    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -207,11 +233,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-mac-vulkan:
+    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -221,11 +250,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-linux-intel-vulkan:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, Intel]

    steps:
@@ -237,11 +269,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-win-intel-vulkan:
+    needs: determine-tag
    runs-on: [self-hosted, Windows, X64, Intel]

    steps:
@@ -256,6 +291,7 @@ jobs:
          MSYSTEM: UCRT64
          CHERE_INVOKING: 1
          PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
@@ -263,6 +299,7 @@ jobs:
          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp

  ggml-ci-intel-openvino-gpu-low-perf:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

    concurrency:
@@ -294,6 +331,8 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          source ./openvino_toolkit/setupvars.sh
          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -50,6 +50,8 @@ jobs:
    env:
      ONEAPI_ROOT: /opt/intel/oneapi/
      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"

    continue-on-error: true

@@ -71,6 +73,14 @@ jobs:
          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept

+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
@@ -107,6 +117,7 @@ jobs:
    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
      ONEAPI_INSTALLER_VERSION: "2025.3.3"
    steps:
@@ -127,6 +138,13 @@ jobs:
        run: |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
--- a/.github/workflows/build-virtgpu.yml
+++ b/.github/workflows/build-virtgpu.yml
@@ -0,0 +1,50 @@
+name: CI (virtgpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-virtgpu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-virtgpu.yml',
+      'ggml/src/ggml-virtgpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-24-virtgpu:
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DGGML_VIRTGPU=ON \
+            -DGGML_VIRTGPU_BACKEND=ON
+          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -456,7 +456,8 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 900
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900

  ubuntu-24-webgpu-wasm:
    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -0,0 +1,51 @@
+name: Code Style Checker
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  model-naming:
+    runs-on: ubuntu-slim
+    steps:
+      - uses: actions/checkout@v6
+      - name: Check model naming conventions
+        run: |
+          python3 - << 'EOF'
+          import re, os, sys
+
+          pairs = re.findall(
+              r'case\s+(LLM_ARCH_\w+)\s*:\s*\n\s+return new (llama_model_\w+)\s*\(',
+              open("src/llama-model.cpp").read())
+
+          errors = []
+          for arch, cls in pairs:
+              suffix  = arch[len("LLM_ARCH_"):]
+              csuffix = cls[len("llama_model_"):]
+              fname   = csuffix.replace("_", "-") + ".cpp"
+
+              if not re.fullmatch(r'[A-Z][A-Z0-9_]*',   suffix):
+                  errors.append(f"{arch}: suffix not upper snake case, example: LLM_ARCH_MY_MODEL")
+
+              if not re.fullmatch(r'[a-z][a-z0-9_]*', csuffix):
+                  errors.append(f"{arch}: class suffix not lower snake case, example: llama_model_my_model")
+
+              elif suffix.lower() != csuffix:
+                  errors.append(f"{arch}: arch/class name mismatch, expected class 'llama_model_{suffix.lower()}' but got '{cls}'")
+
+              elif not os.path.isfile(f"src/models/{fname}"):
+                  errors.append(f"{arch}: expects model file name to be src/models/{fname}, but not found")
+
+          if errors:
+              print('\n'.join(f"  - {e}" for e in errors)); sys.exit(1)
+          print(f"OK: {len(pairs)} mappings validated.")
+          EOF
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -2,11 +2,6 @@ name: EditorConfig Checker

 on:
  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
  push:
    branches:
      - master
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -31,7 +31,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.33
+          pip-install: -r requirements/requirements-all.txt ty==0.0.35
      # - name: Type-check with Pyright
      #   uses: jakebailey/pyright-action@v2
      #   with:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -36,7 +36,14 @@ env:
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

 jobs:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
  macOS-cpu:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        include:
@@ -64,6 +71,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -100,6 +113,9 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        include:
@@ -119,6 +135,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        if: ${{ matrix.build != 's390x' }}
        uses: ggml-org/ccache-action@v1.2.21
@@ -169,6 +191,9 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        include:
@@ -186,6 +211,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -237,6 +268,9 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
+    needs:
+      - webui-build
+
    runs-on: ubuntu-latest

    env:
@@ -249,6 +283,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -306,6 +346,9 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
+    needs:
+      - webui-build
+
    runs-on: ubuntu-24.04

    outputs:
@@ -327,6 +370,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -386,6 +435,9 @@ jobs:
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

  windows-cpu:
+    needs:
+      - webui-build
+
    runs-on: windows-2025

    strategy:
@@ -400,6 +452,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -438,6 +496,9 @@ jobs:
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip

  windows:
+    needs:
+      - webui-build
+
    runs-on: windows-2025

    env:
@@ -461,6 +522,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -520,6 +587,9 @@ jobs:
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
+    needs:
+      - webui-build
+
    runs-on: windows-2022

    strategy:
@@ -531,6 +601,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -591,6 +667,9 @@ jobs:
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
+    needs:
+      - webui-build
+
    runs-on: windows-2022

    defaults:
@@ -600,6 +679,7 @@ jobs:
    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
      ONEAPI_INSTALLER_VERSION: "2025.3.3"

@@ -621,6 +701,19 @@ jobs:
        run: |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -655,6 +748,13 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+          if [ -n "$ZE_LOADER_DLL" ]; then
+            echo "Using Level Zero loader: $ZE_LOADER_DLL"
+            cp "$ZE_LOADER_DLL" ./build/bin
+          else
+            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+          fi

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
@@ -681,6 +781,9 @@ jobs:
          name: llama-bin-win-sycl-x64.zip

  ubuntu-24-sycl:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        build: [fp32, fp16]
@@ -695,6 +798,8 @@ jobs:
    env:
      ONEAPI_ROOT: /opt/intel/oneapi/
      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"

    steps:
      - name: Clone
@@ -718,6 +823,20 @@ jobs:
          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept

+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -757,6 +876,9 @@ jobs:
          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
+    needs:
+      - webui-build
+
    runs-on: ubuntu-22.04

    strategy:
@@ -773,6 +895,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Free up disk space
        uses: ggml-org/free-disk-space@v1.3.1
        with:
@@ -860,6 +988,9 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
+    needs:
+      - webui-build
+
    runs-on: windows-2022

    env:
@@ -876,6 +1007,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Grab rocWMMA package
        id: grab_rocwmma
        run: |
@@ -1122,6 +1259,7 @@ jobs:
    runs-on: ubuntu-slim

    needs:
+      - webui-build
      - windows
      - windows-cpu
      - windows-cuda
@@ -1137,6 +1275,9 @@ jobs:
      - ios-xcode-build
      - openEuler-cann

+    outputs:
+      tag_name: ${{ steps.tag.outputs.name }}
+
    steps:
      - name: Clone
        id: checkout
@@ -1262,3 +1403,15 @@ jobs:
                });
              }
            }
+
+  webui-publish:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+
+    needs:
+      - release
+
+    uses: ./.github/workflows/webui-publish.yml
+    with:
+      version_tag: ${{ needs.release.outputs.tag_name }}
+    secrets:
+      hf_token: ${{ secrets.HF_TOKEN_WEBUI_STATIC_OUTPUT }}
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -39,7 +39,12 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
  server-metal:
+    needs: webui-build
    runs-on: [self-hosted, llama-server, macOS, ARM64]

    name: server-metal (${{ matrix.wf_name }})
@@ -67,6 +72,12 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -1,7 +1,7 @@
 name: Server WebUI

 on:
-  workflow_dispatch: # allows manual triggering
+  workflow_dispatch:
    inputs:
      sha:
        description: 'Commit SHA1 to build'
@@ -13,16 +13,14 @@ on:
    paths: [
      '.github/workflows/server-webui.yml',
      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
+      'tools/server/tests/**.*'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/server-webui.yml',
      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
+      'tools/server/tests/**.*'
    ]

 env:
@@ -36,9 +34,14 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  webui-check:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
+  webui-checks:
    name: WebUI Checks
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    needs: webui-build
+    runs-on: ubuntu-24.04-arm
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -51,7 +54,7 @@ jobs:
        id: node
        uses: actions/setup-node@v6
        with:
-          node-version: "22"
+          node-version: "24"
          cache: "npm"
          cache-dependency-path: "tools/server/webui/package-lock.json"

@@ -71,6 +74,47 @@ jobs:
        run: npm run lint
        working-directory: tools/server/webui

+      - name: Install Playwright browsers
+        id: playwright
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npx playwright install --with-deps
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Unit tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:unit
+        working-directory: tools/server/webui
+
+  e2e-tests:
+    name: E2E Tests
+    needs: webui-build
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        id: node
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install dependencies
+        id: setup
+        if: ${{ steps.node.conclusion == 'success' }}
+        run: npm ci
+        working-directory: tools/server/webui
+
      - name: Build application
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run build
@@ -87,16 +131,6 @@ jobs:
        run: npm run build-storybook
        working-directory: tools/server/webui

-      - name: Run Client tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:client
-        working-directory: tools/server/webui
-
-      - name: Run Unit tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:unit
-        working-directory: tools/server/webui
-
      - name: Run UI tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -54,7 +54,12 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
  server:
+    needs: webui-build
    runs-on: ubuntu-latest

    name: server (${{ matrix.wf_name }})
@@ -93,6 +98,12 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Build
        id: cmake_build
        run: |
@@ -125,6 +136,7 @@ jobs:
          SLOW_TESTS=1 pytest -v -x

  server-windows:
+    needs: webui-build
    runs-on: windows-2022

    steps:
@@ -135,6 +147,12 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/webui-build.yml
+++ b/.github/workflows/webui-build.yml
@@ -0,0 +1,44 @@
+name: Build WebUI
+
+on:
+  workflow_call: # reusable workflow: started by other workflows through `uses:`
+
+jobs:
+  build:
+    name: Build WebUI
+    runs-on: ubuntu-slim
+    env:
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }} # not referenced by any step in this file; presumably consumed by the build - TODO confirm
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install dependencies
+        run: npm ci
+        working-directory: tools/server/webui
+
+      - name: Build application # presumably emits into tools/server/public, which the following steps read - TODO confirm
+        run: npm run build
+        working-directory: tools/server/webui
+
+      - name: Generate checksums # appends one "<sha256> <name>" line per top-level entry of tools/server/public to checksums.txt
+        run: |
+          cd tools/server/public
+          for f in *; do
+            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
+          done
+
+      - name: Upload built webui # published under the artifact name the download steps of the calling workflows use
+        uses: actions/upload-artifact@v6
+        with:
+          name: webui-build
+          path: tools/server/public/
+          retention-days: 1 # short retention; NOTE(review): consumers appear to download it within the same run - confirm
--- a/.github/workflows/webui-publish.yml
+++ b/.github/workflows/webui-publish.yml
@@ -0,0 +1,71 @@
+name: WebUI Publish
+
+on:
+  workflow_call:
+    inputs:
+      version_tag:
+        description: 'Version tag to publish under (e.g., b1234)'
+        required: true
+        type: string
+    secrets:
+      hf_token:
+        description: 'Hugging Face token with write access'
+        required: true
+
+jobs:
+  publish:
+    name: Publish WebUI Static Output
+    runs-on: ubuntu-24.04-arm
+
+    permissions:
+      contents: read
+
+    env:
+      HF_BUCKET_NAME: ${{ vars.HF_BUCKET_WEBUI_STATIC_OUTPUT }}
+      # exposed as a shell variable so the tag is never pasted into script text
+      VERSION_TAG: ${{ inputs.version_tag }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 1
+
+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
+      - name: Install Hugging Face Hub CLI
+        run: pip install -U huggingface_hub
+
+      - name: Authenticate with Hugging Face
+        # hand the token over through the environment instead of expanding the
+        # secret into the generated script
+        env:
+          HF_LOGIN_TOKEN: ${{ secrets.hf_token }}
+        run: hf auth login --token "$HF_LOGIN_TOKEN"
+
+      - name: Sync built files to Hugging Face bucket (version tag)
+        run: |
+          # Upload the built files to the Hugging Face bucket under the release version
+          hf buckets sync tools/server/public "hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${VERSION_TAG}" --delete --quiet
+
+      - name: Sync built files to Hugging Face bucket (latest)
+        run: |
+          # Also upload to the 'latest' directory for fallback downloads
+          hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
+
+      - name: Verify upload
+        run: |
+          # List the files in the bucket to verify the upload
+          hf buckets list "hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${VERSION_TAG}" -R -h
+
+      - name: Clean up root-level files
+        run: |
+          # Clean up any old root-level files from previous non-versioned deployments
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/index.html --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.js --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.css --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/loading.html --yes 2>/dev/null || true
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@
 /tmp/
 /autogen-*.md
 /common/build-info.cpp
+/tools/server/public

 # Deprecated

@@ -96,8 +97,6 @@

 /tools/server/webui/node_modules
 /tools/server/webui/dist
-# we no longer use gz for index.html
-/tools/server/public/index.html.gz

 # Python

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,13 +104,14 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})

 # extra artifacts
-option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_WEBUI    "llama: build the embedded Web UI for server"  ON)
-option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL  "llama: install tests"        ON)
+option(LLAMA_BUILD_TESTS            "llama: build tests"                                                                            ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS            "llama: build tools"                                                                            ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES         "llama: build examples"                                                                         ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER           "llama: build server example"                                                                   ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_WEBUI            "llama: build the embedded Web UI for server"                                                   ON)
+option(LLAMA_USE_PREBUILT_WEBUI     "llama: use prebuilt WebUI from HF Bucket when available (requires LLAMA_BUILD_WEBUI=ON)"       ON)
+option(LLAMA_TOOLS_INSTALL          "llama: install tools"                                                                          ${LLAMA_TOOLS_INSTALL_DEFAULT})
+option(LLAMA_TESTS_INSTALL          "llama: install tests"                                                                          ON)

 # 3rd party libs
 option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -46,7 +46,9 @@ Before submitting your PR:
    - provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
    - provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.
+- If you are a new contributor:
+    - Limit your open PRs to 1
+    - Do not submit trivial fixes (e.g. typos, formatting changes)

 After submitting your PR:
 - Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -308,12 +308,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
    common_download_opts opts;
    opts.bearer_token = params.hf_token;
    opts.offline = params.offline;
+
+    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
    const int status = common_download_file_single(preset_url, preset_path, opts);
    const bool has_preset = status >= 200 && status < 400;

    // remote preset is optional, so we don't error out if not found
    if (has_preset) {
-        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
        common_preset_context ctx(ex, /* only_remote_allowed */ true);
        common_preset global;
        auto remote_presets = ctx.load_from_ini(preset_path, global);
@@ -326,7 +328,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
        }
    } else {
-        LOG_INF("%s", "no remote preset found, skipping\n");
+        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
    }

    return has_preset;
@@ -357,8 +359,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
        auto download_result = common_download_model(model, opts, true);

        if (download_result.model_path.empty()) {
-            LOG_ERR("error: failed to download model from Hugging Face\n");
-            exit(1);
+            throw std::runtime_error("failed to download model from Hugging Face");
        }

        model.name = model.hf_repo;
@@ -380,8 +381,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
        opts.offline = offline;
        auto download_result = common_download_model(model, opts);
        if (download_result.model_path.empty()) {
-            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
-            exit(1);
+            throw std::runtime_error("failed to download model from " + model.url);
        }
    }

@@ -435,6 +435,25 @@ static bool parse_bool_value(const std::string & value) {
 // CLI argument parsing functions
 //

+void common_params_handle_models(common_params & params, llama_example curr_ex) {
+    auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
+    if (params.no_mmproj) {
+        params.mmproj = {};
+    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+        // optionally, handle mmproj model when -hf is specified
+        params.mmproj = res.mmproj;
+    }
+    // only download mmproj if the current example is using it
+    for (const auto & ex : mmproj_examples) {
+        if (curr_ex == ex) {
+            common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
+            break;
+        }
+    }
+    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
+    common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

@@ -588,22 +607,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

    // handle model and download
    if (!skip_model_download) {
-        auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (const auto & ex : mmproj_examples) {
-            if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
-                break;
-            }
-        }
-        common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
-        common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
+        common_params_handle_models(params, ctx_arg.ex);
    }

    // model is required (except for server)
@@ -622,10 +626,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
-        for (auto & pair : params.speculative.draft.replacements) {
-            string_process_escapes(pair.first);
-            string_process_escapes(pair.second);
-        }
    }

    if (!params.kv_overrides.empty()) {
@@ -2223,7 +2223,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
-            "comma separated list of RPC servers (host:port)",
+            "comma-separated list of RPC servers (host:port)",
            [](common_params & params, const std::string & value) {
                add_rpc_devices(value);
                GGML_UNUSED(params);
@@ -3303,18 +3303,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
+        {"--no-log-prefix"},
        "Enable prefix in log messages",
-        [](common_params &) {
-            common_log_set_prefix(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_prefix(common_log_main(), value);
        }
-    ).set_env("LLAMA_LOG_PREFIX"));
+    ).set_env("LLAMA_ARG_LOG_PREFIX"));
    add_opt(common_arg(
        {"--log-timestamps"},
+        {"--no-log-timestamps"},
        "Enable timestamps in log messages",
-        [](common_params &) {
-            common_log_set_timestamps(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_timestamps(common_log_main(), value);
        }
-    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+    ).set_env("LLAMA_ARG_LOG_TIMESTAMPS"));

    //
    // speculative parameters
@@ -3518,13 +3520,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.draft.p_min = std::stof(value);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
-    add_opt(common_arg(
-        {"--spec-draft-ctx-size", "-cd", "--ctx-size-draft"}, "N",
-        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.draft.n_ctx),
-        [](common_params & params, int value) {
-            params.speculative.draft.n_ctx = value;
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CTX_SIZE"));
    add_opt(common_arg(
        {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -3561,32 +3556,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
-        {"--spec-draft-replace", "--spec-replace"}, "TARGET", "DRAFT",
-        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
-        [](common_params & params, const std::string & tgt, const std::string & dft) {
-            params.speculative.draft.replacements.push_back({ tgt, dft });
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
-    add_opt(common_arg(
-        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
-        string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
-            common_speculative_type_to_str(params.speculative.type).c_str()),
+        {"--spec-type"}, common_speculative_all_types_str(),
+        string_format("comma-separated list of types of speculative decoding to use (default: %s)\n",
+            common_speculative_type_name_str(params.speculative.types).c_str()),
        [](common_params & params, const std::string & value) {
-            if (value == "none") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
-            } else if (value == "ngram-cache") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
-            } else if (value == "ngram-simple") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
-            } else if (value == "ngram-map-k") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
-            } else if (value == "ngram-map-k4v") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
-            } else if (value == "ngram-mod") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
-            } else {
-                throw std::invalid_argument("unknown speculative decoding type without draft model");
-            }
+            const auto enabled_types = string_split<std::string>(value, ',');
+            params.speculative.types = common_speculative_types_from_names(enabled_types);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE"));
    add_opt(common_arg(
@@ -4075,7 +4050,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--spec-default"},
        string_format("enable default speculative decoding config"),
        [](common_params & params) {
-            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };
            params.speculative.ngram_mod.n_match = 24;
            params.speculative.ngram_mod.n_min = 48;
            params.speculative.ngram_mod.n_max = 64;
--- a/common/arg.h
+++ b/common/arg.h
@@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

+// Populate model paths (main model, mmproj, etc) from -hf if necessary
+void common_params_handle_models(common_params & params, llama_example curr_ex);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -366,15 +366,29 @@ void common_init() {
    SetConsoleCP(CP_UTF8);
 #endif

-    llama_log_set(common_log_default_callback, NULL);
+    common_log_set_prefix(common_log_main(), true);
+    common_log_set_timestamps(common_log_main(), true);

+    llama_log_set(common_log_default_callback, NULL);
+}
+
+void common_params_print_info(const common_params & params) {
 #ifdef NDEBUG
    const char * build_type = "";
 #else
    const char * build_type = " (debug)";
 #endif
+    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);

-    LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
+    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
+    LOG_INF("device_info:\n");
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+        LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+    }
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 }

 std::string common_params_get_system_info(const common_params & params) {
@@ -1147,7 +1161,8 @@ common_init_result::common_init_result(common_params & params) :
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory ...\n", __func__);
+        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
@@ -1196,7 +1211,7 @@ common_init_result::common_init_result(common_params & params) :
    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }
@@ -1209,12 +1224,12 @@ common_init_result::common_init_result(common_params & params) :
    }

    //if (params.sampling.penalty_last_n == -1) {
-    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
-    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

@@ -1422,7 +1437,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {

    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
+        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1960,3 +1975,106 @@ bool common_prompt_batch_decode(

    return true;
 }
+
+// combined size of the stored target and draft state buffers
+size_t common_prompt_checkpoint::size() const {
+    return data_tgt.size() + data_dft.size();
+}
+
+// note: only the target buffer is inspected - a checkpoint holding just draft data still reports empty
+bool common_prompt_checkpoint::empty() const {
+    return data_tgt.empty();
+}
+
+// reset the position bookkeeping and drop both state buffers
+void common_prompt_checkpoint::clear() {
+    n_tokens = 0;
+
+    pos_min = 0;
+    pos_max = 0;
+
+    data_tgt.clear();
+    data_dft.clear();
+}
+
+// record the token count and position range this checkpoint corresponds to
+void common_prompt_checkpoint::update_pos(
+        int64_t n_tokens,
+        llama_pos pos_min,
+        llama_pos pos_max) {
+    this->n_tokens = n_tokens;
+    this->pos_min  = pos_min;
+    this->pos_max  = pos_max;
+}
+
+// shared by update_tgt/update_dft: store the state of seq_id from ctx in buf (resized to fit)
+// does nothing when ctx is null, aborts when the written size differs from the reported one
+template <typename T>
+static void common_prompt_checkpoint_save(
+        T & buf,
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    if (ctx == nullptr) {
+        return;
+    }
+
+    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
+
+    buf.resize(ckpt_size);
+
+    const size_t n = llama_state_seq_get_data_ext(ctx, buf.data(), ckpt_size, seq_id, flags);
+    if (n != ckpt_size) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
+    }
+}
+
+// shared by load_tgt/load_dft: apply the state stored in buf to seq_id of ctx
+// does nothing when ctx is null or buf is empty, aborts on a size mismatch
+template <typename T>
+static void common_prompt_checkpoint_load(
+        const T & buf,
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    if (ctx == nullptr) {
+        return;
+    }
+
+    if (buf.empty()) {
+        return;
+    }
+
+    const size_t n = llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
+    if (n != buf.size()) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", buf.size(), n);
+    }
+}
+
+void common_prompt_checkpoint::update_tgt(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    common_prompt_checkpoint_save(data_tgt, ctx, seq_id, flags);
+}
+
+void common_prompt_checkpoint::update_dft(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    common_prompt_checkpoint_save(data_dft, ctx, seq_id, flags);
+}
+
+void common_prompt_checkpoint::load_tgt(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) const {
+    common_prompt_checkpoint_load(data_tgt, ctx, seq_id, flags);
+}
+
+void common_prompt_checkpoint::load_dft(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) const {
+    common_prompt_checkpoint_load(data_dft, ctx, seq_id, flags);
+}
--- a/common/common.h
+++ b/common/common.h
@@ -157,9 +157,9 @@ enum common_params_sampling_config : uint64_t {

 enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
-    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
-    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
+    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
@@ -295,8 +295,6 @@ struct common_params_model {
    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

-struct common_ngram_mod;
-
 // draft-model-based speculative decoding parameters
 struct common_params_speculative_draft {
    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
@@ -307,11 +305,9 @@ struct common_params_speculative_draft {

    common_params_model mparams;

-    llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
+    llama_context * ctx_tgt = nullptr;
+    llama_context * ctx_dft = nullptr;

-    llama_context_params cparams; // these are the parameters for the draft llama_context
-
-    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
@@ -322,7 +318,6 @@ struct common_params_speculative_draft {

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 };

@@ -331,9 +326,6 @@ struct common_params_speculative_ngram_mod {

    int32_t n_max = 64;
    int32_t n_min = 48;
-
-    // shared instance of the ngram container for all speculative decoding contexts
-    std::shared_ptr<common_ngram_mod> obj;
 };

 struct common_params_speculative_ngram_map {
@@ -348,9 +340,9 @@ struct common_params_speculative_ngram_cache {
 };

 struct common_params_speculative {
-    // TODO: become a vector in order to support "chains of speculators"
-    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
+    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };

+    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
    common_params_speculative_draft draft;

    common_params_speculative_ngram_mod ngram_mod;
@@ -613,7 +605,11 @@ struct common_params {
    std::map<std::string, std::string> default_template_kwargs;

    // webui configs
-    bool webui = true;
+#ifdef LLAMA_WEBUI_DEFAULT_ENABLED
+    bool webui = LLAMA_WEBUI_DEFAULT_ENABLED != 0;
+#else
+    bool webui = true; // default to enabled when not set
+#endif
    bool webui_mcp_proxy = false;
    std::string webui_config_json;

@@ -694,6 +690,7 @@ struct common_params {
 // initializes the logging system and prints info about the build
 void common_init();

+void common_params_print_info(const common_params & params);
 std::string common_params_get_system_info(const common_params & params);

 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
@@ -1026,3 +1023,47 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

 // "adamw" or "sgd" (case insensitive)
 enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
+
+//
+// prompt utils
+//
+
+struct common_prompt_checkpoint { // token/position bookkeeping plus two raw state buffers (tgt and dft)
+    int64_t n_tokens; // NOTE(review): no default initializer; presumably assigned via update_pos() - confirm
+
+    llama_pos pos_min; // NOTE(review): no default initializer - confirm every creator sets it
+    llama_pos pos_max; // NOTE(review): no default initializer - confirm every creator sets it
+
+    std::vector<uint8_t> data_tgt; // bytes that load_tgt() passes to llama_state_seq_set_data_ext()
+    std::vector<uint8_t> data_dft; // bytes that load_dft() passes to llama_state_seq_set_data_ext()
+
+    size_t size() const;
+
+    bool empty() const;
+    void clear();
+
+    void update_pos(
+            int64_t n_tokens,
+            llama_pos pos_min,
+            llama_pos pos_max);
+
+    void update_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags); // presumably refreshes data_tgt from ctx - definition not visible here, confirm
+
+    void update_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags); // presumably refreshes data_dft from ctx - definition not visible here, confirm
+
+    void load_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const; // no-op if ctx is null or data_tgt is empty; aborts on size mismatch
+
+    void load_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const; // no-op if ctx is null or data_dft is empty; aborts on size mismatch
+};
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -320,9 +320,9 @@ static int common_download_file_single_online(const std::string & url,

    auto head = cli.Head(parts.path);
    if (!head || head->status < 200 || head->status >= 300) {
-        LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
+        LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
        if (file_exists) {
-            LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
+            LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        return head ? head->status : -1;
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -168,7 +168,7 @@ static void common_params_fit_impl(

    // step 1: get data for default parameters and check whether any changes are necessary in the first place

-    LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
+    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices

@@ -213,13 +213,13 @@ static void common_params_fit_impl(
        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (sum_projected_free >= margins[0]) {
-            LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
                __func__, sum_projected_free/MiB, margins[0]/MiB);
            return;
        }
    } else {
        if (nd > 1) {
-            LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+            LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__);
        }
        for (size_t id = 0; id < nd; id++) {
            const llama_device_memory_data & dmd = dmds_full[id];
@@ -234,16 +234,16 @@ static void common_params_fit_impl(
            sum_projected_model += dmd.mb.model;

            if (nd > 1) {
-                LOG_INF("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                LOG_TRC("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
            }
        }
        assert(sum_free >= 0 && sum_projected_used >= 0);
-        LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+        LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (nd == 1) {
            if (projected_free_per_device[0] >= margins[0]) {
-                LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
                return;
            }
@@ -256,7 +256,7 @@ static void common_params_fit_impl(
                }
            }
            if (!changes_needed) {
-                LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
                return;
            }
        }
@@ -275,10 +275,10 @@ static void common_params_fit_impl(
        }
        if (global_surplus < 0) {
            if (nd <= 1) {
-                LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                    __func__, margins[0]/MiB, -global_surplus/MiB);
            } else {
-                LOG_INF(
+                LOG_TRC(
                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
                    __func__, -global_surplus/MiB);
            }
@@ -320,28 +320,28 @@ static void common_params_fit_impl(

                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                        if (nd <= 1) {
-                            LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
+                            LOG_TRC("%s: entire model can be fit by reducing context\n", __func__);
                            return;
                        }
-                        LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
+                        LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__);
                    } else {
                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                    }
                } else {
                    if (n_ctx_min == UINT32_MAX) {
-                        LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+                        LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
                    } else {
-                        LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+                        LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                            __func__, hp_nct, n_ctx_min);
                    }
                }
            } else {
-                LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
+                LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
    }
@@ -485,10 +485,10 @@ static void common_params_fit_impl(
        const dmds_t dmd_nl = common_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

-        LOG_INF("%s: memory for test allocation by device:\n", func_name);
+        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
-            LOG_INF(
+            LOG_TRC(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }
@@ -509,7 +509,7 @@ static void common_params_fit_impl(
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

-        LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
+        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

@@ -519,10 +519,10 @@ static void common_params_fit_impl(
        }

        if (global_surplus_cpu_moe > 0) {
-            LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
-            LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

@@ -535,7 +535,7 @@ static void common_params_fit_impl(
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margins[id]);
-        LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
+        LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
@@ -555,9 +555,9 @@ static void common_params_fit_impl(
    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
    //   - the last device has the output layer, which cannot be a partial layer
    if (hp_nex == 0) {
-        LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense layers back-to-front:\n", __func__);
    } else {
-        LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    for (int id = nd - 1; id >= 0; id--) {
        uint32_t n_unassigned = hp_ngl + 1;
@@ -576,7 +576,7 @@ static void common_params_fit_impl(
            if (mem_high[id] > targets[id]) {
                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
+                LOG_TRC("%s: start filling device %d, delta=%" PRIu32 "\n", __func__, id, delta);
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
@@ -593,11 +593,11 @@ static void common_params_fit_impl(
                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem            = mem_test;
-                        LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high            = mem_test;
-                        LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
@@ -605,12 +605,12 @@ static void common_params_fit_impl(
                assert(ngl_per_device_high[id].n_layer == n_unassigned);
                ngl_per_device = ngl_per_device_high;
                mem            = mem_high;
-                LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
@@ -634,7 +634,7 @@ static void common_params_fit_impl(
    }
    assert(id_dense_start < nd);

-    LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
+    LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
@@ -674,13 +674,13 @@ static void common_params_fit_impl(
                    ngl_per_device = ngl_per_device_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high            = mem_test;
                    id_dense_start_high = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
@@ -690,7 +690,7 @@ static void common_params_fit_impl(
            ngl_per_device = ngl_per_device_high;
            mem            = mem_high;
            id_dense_start = id_dense_start_high;
-            LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+            LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

@@ -710,44 +710,44 @@ static void common_params_fit_impl(
            if (id < nd - 1) {
                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
            }
-            LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
+            LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                ngl_per_device = ngl_per_device_test;
                overflow_bufts = overflow_bufts_test;
                mem            = mem_test;
                id_dense_start = id_dense_start_test;
-                LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
+                LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
@@ -755,7 +755,7 @@ static void common_params_fit_impl(
    // print info for devices that were not changed during the conversion from dense only to full layers:
    for (size_t id = id_dense_start + 1; id < nd; id++) {
        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
@@ -776,7 +776,7 @@ enum common_params_fit_status common_fit_params(
    common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
    try {
        common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
-        LOG_INF("%s: successfully fit params to free device memory\n", __func__);
+        LOG_TRC("%s: successfully fit params to free device memory\n", __func__);
    } catch (const common_params_fit_exception & e) {
        LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        status = COMMON_PARAMS_FIT_STATUS_FAILURE;
@@ -785,7 +785,7 @@ enum common_params_fit_status common_fit_params(
        status = COMMON_PARAMS_FIT_STATUS_ERROR;
    }
    const int64_t t1_us = llama_time_us();
-    LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
+    LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return status;
 }

@@ -925,7 +925,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
        }
    }
    for (const auto & td : table_data) {
-        LOG_INF(td[0].c_str(),
+        LOG_TRC(td[0].c_str(),
            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
            td[6].c_str(), td[7].c_str(), td[8].c_str());
    }
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -435,10 +435,10 @@ void common_log_flush(struct common_log * log) {
 static int common_get_verbosity(enum ggml_log_level level) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
-        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
+        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_TRACE;
        case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
        case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
-        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
+        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_TRACE;
        case GGML_LOG_LEVEL_NONE:
        default:
            return LOG_LEVEL_OUTPUT;
--- a/common/log.h
+++ b/common/log.h
@@ -21,7 +21,8 @@
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#define LOG_LEVEL_DEBUG  4
+#define LOG_LEVEL_DEBUG  5
+#define LOG_LEVEL_TRACE  4
 #define LOG_LEVEL_INFO   3
 #define LOG_LEVEL_WARN   2
 #define LOG_LEVEL_ERROR  1
@@ -111,13 +112,15 @@ void common_log_flush         (struct common_log * log);                    // f
 #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)

 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG,  __VA_ARGS__)
+#define LOG_TRC(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_TRACE,  __VA_ARGS__)
 #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,   __VA_ARGS__)
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,   __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR,  __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,   __VA_ARGS__) // same as INFO

+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_TRCV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__) // same ggml level as LOG_TRC
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
 #define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {
    }
 }

-void common_preset::apply_to_params(common_params & params) const {
+void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
    for (const auto & [opt, val] : options) {
+        if (!handled_keys.empty()) {
+            if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
+                continue;
+            }
+        }
        // apply each option to params
        if (opt.handler_string) {
            opt.handler_string(params, val);
--- a/common/preset.h
+++ b/common/preset.h
@@ -43,7 +43,8 @@ struct common_preset {
    void merge(const common_preset & other);

    // apply preset options to common_params
-    void apply_to_params(common_params & params) const;
+    // optionally specify handled_keys to only apply a subset of options (identified by their env); if empty, all options are applied
+    void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
 };

 // interface for multiple presets in one file
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -5,8 +5,14 @@

 struct common_speculative;

+// comma separated list of the provided types
+std::string common_speculative_type_name_str(const std::vector<enum common_speculative_type> & types);
+
 // comma separated list of all types
-std::string common_speculative_type_name_str();
+const char * common_speculative_all_types_str();
+
+// parse user provided types
+std::vector<enum common_speculative_type> common_speculative_types_from_names(const std::vector<std::string> & names);

 // convert string to type
 enum common_speculative_type common_speculative_type_from_name(const std::string & name);
@@ -14,27 +20,44 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 // convert type to string
 std::string common_speculative_type_to_str(enum common_speculative_type type);

-common_speculative * common_speculative_init(
-        common_params_speculative & params,
-        llama_context             * ctx_tgt);
+common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq);

 void common_speculative_free(common_speculative * spec);

+struct common_speculative_draft_params {
+    // this flag is used to chain the drafts through all the available implementations
+    // after the first successful draft from an implementation, we set it
+    //   to false to prevent further drafts for that sequence
+    // at the end of the draft() call, all drafting flags will be reset to false
+    bool drafting = false;
+
+    // overrides individual configurations (-1 disabled)
+    // can be used to constrain the max draft based on the remaining context size
+    int32_t n_max = -1;
+
+    llama_pos   n_past  = 0;                // defaulted so a fresh instance never holds an indeterminate value
+    llama_token id_last = LLAMA_TOKEN_NULL; // "no token" until the caller sets it
+
+    // TODO: remove in the future by keeping track of the prompt from the _begin() call and the consecutive accept calls
+    const llama_tokens * prompt = nullptr;
+
+    // the generated draft from the last _draft() call
+    llama_tokens * result = nullptr;
+};
+
+common_speculative_draft_params & common_speculative_get_draft_params(common_speculative * spec, llama_seq_id seq_id);
+
 // optionally call once at the beginning of a new generation
-void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);
+void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, const llama_tokens & prompt);

-// sample up to n_draft tokens and add them to the batch using the draft model
-llama_tokens common_speculative_draft(
-                     common_speculative * spec,
-        const common_params_speculative & params,
-                     const llama_tokens & prompt,
-                            llama_token   id_last);
+// process the batch and update the internal state of the speculative context
+bool common_speculative_process(common_speculative * spec, const llama_batch & batch);

-// informs the speculative decoder that n_accepted tokens were accepted by the target model
-void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
+// generate drafts for the sequences specified with `common_speculative_get_draft_params`
+void common_speculative_draft(common_speculative * spec);

-int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
-int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
+// informs the speculative context that n_accepted tokens were accepted by the target model for the given sequence
+void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, uint16_t n_accepted);

 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2865,8 +2865,12 @@ class LlamaModel(TextModel):
        # fix for SmolVLM2, missing `num_attention_heads` in config.json
        if self.hf_arch == "VLlama3ForCausalLM":
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
-        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
-        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+        # Mistral consolidated format has no config.json; origin_hf_arch is HF-only.
+        if self.is_mistral_format:
+            self.origin_hf_arch = None
+        else:
+            hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+            self.origin_hf_arch = hparams.get('architectures', [None])[0]

    def set_vocab(self):
        if self.origin_hf_arch == "GlmasrModel":
@@ -9760,6 +9764,73 @@ class MimoV2Model(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("MiMoV2ForCausalLM")
+class MiMoV2VisionModel(MmprojModel):
+    """Converter for the vision tower (`visual.*` tensors) of MiMoV2 checkpoints.
+
+    Reads the vision hyper-parameters from `self.hparams_vision`, writes the
+    MIMOVL projector metadata and splits the Conv3D patch embedding into two
+    Conv2D weights.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        hp = self.hparams_vision
+
+        # Mirror MiMoV2's own key names (`num_heads`, `depth`) under the generic
+        # names, falling back to fixed defaults when a field is absent.
+        hp["image_size"] = hp.get("image_size", 560)
+        hp["num_attention_heads"] = hp.get("num_heads", 32)
+        hp["num_hidden_layers"] = hp.get("depth", 28)
+
+        # Read the normalized key so that a config without `num_heads` uses the
+        # same fallback as above instead of raising KeyError.
+        self.n_q_heads = int(hp["num_attention_heads"])
+        self.num_kv_heads = int(hp.get("num_key_value_heads", 8))
+        self.head_dim = int(hp.get("qk_channels", 64))
+        self.spatial_merge_size = int(hp["spatial_merge_size"])
+        # MiMoV2 vision RMSNorm: HF uses getattr(config, "rms_norm_eps", 1e-6) and the
+        # field is absent from MiMo-V2.5's vision_config
+        self.rms_norm_eps = float(hp.get("rms_norm_eps", 1e-6))
+
+        # fullatt_block_indexes are also reflected in vit_window_attn_types as -1
+        self.fullatt_block_indexes = list(hp.get("fullatt_block_indexes") or [])
+        self.vit_window_attn_types = list(hp.get("vit_window_attn_types") or [])
+        # NOTE(review): the -1 fallback is later written with add_uint32 in
+        # set_gguf_parameters - confirm a missing field is handled as intended.
+        self.visual_token_window_size = int(hp.get("visual_token_window_size", -1))
+        self.use_sink = bool(hp.get("use_sink", False))
+
+    def set_gguf_parameters(self):
+        """Write the MiMoV2-specific vision metadata on top of the base-class parameters."""
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MIMOVL)
+        self.gguf_writer.add_vision_use_silu(True)
+        self.gguf_writer.add_vision_head_count_kv(self.num_kv_heads)
+        self.gguf_writer.add_vision_spatial_merge_size(self.spatial_merge_size)
+        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.WINDOW_SIZE, self.visual_token_window_size)
+        self.gguf_writer.add_vision_wa_pattern_mode(self.vit_window_attn_types)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.rms_norm_eps)
+        # min/max pixel limits come from the preprocessor config and are required keys here
+        self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
+        self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        """Force `.attn_sinks` tensors to F32; defer to the base class for everything else."""
+        # Sinks must be F32: any sink-style softmax/mask add in ggml requires
+        # F32, and we fold sinks into a host-built F32 mask at encode time.
+        if new_name.endswith(".attn_sinks"):
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        """Drop every tensor whose name does not start with `visual.`; pass the rest to the base filter."""
+        name, _ = item
+        if not name.startswith("visual."):
+            return None
+        return super().filter_tensors(item)
+
+    def modify_tensors(self, data_torch, name, bid):
+        """Yield the output tensors for `name`, splitting the Conv3D patch embedding in two."""
+        # Conv3D patch embed: split along the temporal axis (kt=2) into two Conv2D
+        # weights that the existing qwen2vl-style two-Conv2D path consumes.
+        if name == "visual.patch_embed.proj.weight":
+            _, _, kt, _, _ = data_torch.shape
+            if kt != 2:
+                raise ValueError(f"unexpected temporal_patch_size: {kt}")
+            embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
+            yield (embd_name + ".weight",   data_torch[:, :, 0, ...])
+            yield (embd_name + ".weight.1", data_torch[:, :, 1, ...])
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
@ModelBase.register("Step3p5ForCausalLM")
 class Step35Model(TextModel):
    model_arch = gguf.MODEL_ARCH.STEP35
@@ -13342,7 +13413,7 @@ class PixtralModel(LlavaVisionModel):
        self.gguf_writer.add_vision_use_silu(True)

        # spatial_merge_size
-        if self.find_vparam(["mm_projector_id"]) == "patch_merge":
+        if self.find_vparam(["mm_projector_id"], optional=True) == "patch_merge":
            self.gguf_writer.add_vision_spatial_merge_size(
                self.find_vparam(["spatial_merge_size"])
            )
@@ -13350,8 +13421,12 @@ class PixtralModel(LlavaVisionModel):
    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
        if name == "vision_language_adapter.w_in.weight":
            return "mm.1.weight"
+        elif name == "vision_language_adapter.w_in.bias":
+            return "mm.1.bias"
        elif name == "vision_language_adapter.w_out.weight":
            return "mm.2.weight"
+        elif name == "vision_language_adapter.w_out.bias":
+            return "mm.2.bias"
        return super().map_tensor_name(name, try_suffixes)


--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -188,6 +188,24 @@ class LoraTorchTensor:
    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
        return self.transpose(axis0, axis1)

+    def split(self, split_size: int | Sequence[int], dim: int = 0) -> tuple[LoraTorchTensor, ...]:
+        """Split this tensor along `dim` by splitting its `_lora_A` / `_lora_B` factors.
+
+        `split_size` is forwarded unchanged to the factors' own `split`.
+        Returns a tuple of LoraTorchTensor chunks.
+        """
+        shape = self.shape
+        ndim = len(shape)
+        if dim < 0:
+            dim += ndim  # turn a negative dim into its non-negative index
+        if dim == ndim - 1:
+            # last dim: only lora_A is split (on its last dim); every chunk reuses lora_B
+            A_chunks = self._lora_A.split(split_size, dim=-1)
+            return tuple(LoraTorchTensor(a, self._lora_B) for a in A_chunks)
+        elif dim == ndim - 2:
+            # second-to-last dim: only lora_B is split (on dim -2); every chunk reuses lora_A
+            B_chunks = self._lora_B.split(split_size, dim=-2)
+            return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
+        else:
+            # any earlier dim: lora_B is always split along it
+            B_chunks = self._lora_B.split(split_size, dim=dim)
+            if self._lora_A.shape[dim] == 1:
+                # lora_A has size 1 on this dim, so the same lora_A is paired with each B chunk
+                return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
+            # otherwise split lora_A the same way and pair the chunks positionally
+            A_chunks = self._lora_A.split(split_size, dim=dim)
+            return tuple(LoraTorchTensor(a, b) for a, b in zip(A_chunks, B_chunks))
+
    def to(self, *args, **kwargs):
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

@@ -230,6 +248,11 @@ class LoraTorchTensor:
                )
            else:
                raise NotImplementedError
+        elif func is torch.split:
+            assert len(args) and len(args) >= 2
+            tensor, split_size = args[0], args[1]
+            dim = args[2] if len(args) > 2 else kwargs.get("dim", 0)
+            return tensor.split(split_size, dim=dim)
        else:
            raise NotImplementedError

--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -57,17 +57,22 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin

 ## Validated Models

-The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:
+The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html).
+- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
+- `-fa 1` is required when running llama-bench with the OpenVINO backend.
+- Additional model support, quantization formats and validations are work in progress.

- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/)
- [Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B-GGUF)
- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-S-1B-sft-gguf)
- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF)
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
+| Model  | Validated   | Known Issues  |
+| :------| :---------- | :-------------|
+| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
+| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail |
+| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) |
+| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
+| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations |
+| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — |
+| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
+| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported |
+| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` |

 ## Build Instructions

--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -720,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_GRAPH    | OFF *(default)* \|ON *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
+| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

@@ -733,9 +734,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
+| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
-| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
+| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |

 ## Compile-time Flags

@@ -819,7 +821,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo

 - `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`

-  You need to enable to support 4GB memory malloc by:
+  With the default `GGML_SYCL_ENABLE_LEVEL_ZERO=1`, llama.cpp requests Level Zero's relaxed maximum-size allocation limit directly. If Level Zero support is disabled at build time or runtime and the allocation goes through SYCL/Unified Runtime instead, enable support for allocations larger than 4 GiB by:
  ```
    export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
    set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -9,18 +9,20 @@ wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_6
 ~~~

 2. Build
-Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
+Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization versions are `RISCV64_SPACEMIT_IME1` and `RISCV64_SPACEMIT_IME2`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
 ```bash

 cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CPU_RISCV64_SPACEMIT=ON \
+    -DGGML_CPU_REPACK=OFF \
    -DLLAMA_OPENSSL=OFF \
    -DGGML_RVV=ON \
+    -DGGML_RV_ZVFH=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
    -DGGML_RV_ZIHINTPAUSE=ON \
-    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+    -DGGML_RV_ZBA=ON \
    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
    -DCMAKE_INSTALL_PREFIX=build/installed

@@ -47,8 +49,25 @@ export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}

 ${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
 ~~~
+
+## Quantization Support For Matrix
+
+| Quantization Type | X60 | A100 |
+| ---: | ---: | ---: |
+| Q2_K |  | :heavy_check_mark: |
+| Q3_K |  | :heavy_check_mark: |
+| Q4_0 | :heavy_check_mark: | :heavy_check_mark: |
+| Q4_1 | :heavy_check_mark: | :heavy_check_mark: |
+| Q4_K | :heavy_check_mark: | :heavy_check_mark: |
+| Q5_0 |  | :heavy_check_mark: |
+| Q5_1 |  | :heavy_check_mark: |
+| Q5_K |  | :heavy_check_mark: |
+| Q6_K |  | :heavy_check_mark: |
+| Q8_0 |  | :heavy_check_mark: |
+
+
 ## Performance
-#### Quantization Support For Matrix
+* Spacemit(R) X60
 ~~~
 model name      : Spacemit(R) X60
 isa             : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
@@ -58,33 +77,34 @@ mvendorid       : 0x710
 marchid         : 0x8000000058000001
 ~~~

-Q4_0
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | pp512|64.12 ± 0.26|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | tg128|10.03 ± 0.01|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | pp512|24.16 ± 0.02|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | tg128|3.83 ± 0.06|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | pp512|12.08 ± 0.02|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | tg128|2.23 ± 0.02|
-
-Q4_1
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | pp512|62.07 ± 0.12|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | tg128|9.91 ± 0.01|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | pp512|22.95 ± 0.25|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | tg128|4.01 ± 0.15|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | pp512|11.55 ± 0.16|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | tg128|2.25 ± 0.04|
+| model                          |       size |     params | backend    | threads | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       4 |      128 |  1 |    0 |           pp128 |         10.32 ± 0.02 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       4 |      128 |  1 |    0 |           tg128 |          3.07 ± 0.01 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       4 |      128 |  1 |    0 |           pp128 |         49.15 ± 0.25 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       4 |      128 |  1 |    0 |           tg128 |         11.73 ± 0.02 |


-Q4_K
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | pp512|9.29 ± 0.05|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | tg128|5.67 ± 0.04|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | pp512|10.38 ± 0.10|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | tg128|3.17 ± 0.08|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | pp512|4.23 ± 0.04|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | tg128|1.73 ± 0.00|
+* Spacemit(R) A100
+~~~
+model name      : Spacemit(R) A100
+isa             : rv64imafdcvh_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
+mmu             : sv39
+mvendorid       : 0x710
+marchid         : 0x8000000041000002
+mimpid          : 0x10000000d5686200
+hart isa        : rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
+~~~
+
+| model                          |       size |     params | backend    | threads | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       8 |      128 |  1 |    0 |           pp128 |        565.83 ± 0.31 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       8 |      128 |  1 |    0 |           tg128 |         55.77 ± 0.02 |
+| qwen3 4B Q4_0                  |   2.21 GiB |     4.02 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         79.74 ± 0.04 |
+| qwen3 4B Q4_0                  |   2.21 GiB |     4.02 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         11.29 ± 0.00 |
+| qwen3moe 30B.A3B Q4_0          |  16.18 GiB |    30.53 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         57.88 ± 0.31 |
+| qwen3moe 30B.A3B Q4_0          |  16.18 GiB |    30.53 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         12.79 ± 0.00 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |        115.23 ± 0.04 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         16.49 ± 0.01 |
+| gemma4 E4B Q4_K - Medium       |   4.76 GiB |     7.52 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         21.13 ± 0.01 |
+| gemma4 E4B Q4_K - Medium       |   4.76 GiB |     7.52 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |          5.66 ± 0.00 |
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -18,7 +18,7 @@ Legend:
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
@@ -71,7 +71,7 @@ Legend:
 |                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
@@ -118,5 +118,5 @@ Legend:
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
--- a/examples/llama-eval/README.md
+++ b/examples/llama-eval/README.md
@@ -0,0 +1,26 @@
+# llama-eval
+
+Simple evaluation tool for llama.cpp with support for multiple datasets.
+
+For a full description, usage examples, and sample results, see:
+
+- [PR 21152](https://github.com/ggml-org/llama.cpp/pull/21152)
+
+## Quick start
+
+```bash
+# Single server
+python3 llama-eval.py \
+  --server http://localhost:8033 \
+  --model my-model \
+  --dataset gsm8k --n_cases 100 \
+  --grader-type regex --threads 32
+
+# Multiple servers (comma-separated URLs and thread counts)
+python3 llama-eval.py \
+  --server http://server1:8033,http://server2:8033 \
+  --server-name server1,server2 \
+  --threads 16,16 \
+  --dataset aime2025 --n_cases 240 \
+  --grader-type regex
+```
--- a/examples/llama-eval/llama-eval.py
+++ b/examples/llama-eval/llama-eval.py
--- a/examples/llama-eval/llama-server-simulator.py
+++ b/examples/llama-eval/llama-server-simulator.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import random
+import re
+import time
+import sys
+import os
+import threading
+from http.server import HTTPServer, BaseHTTPRequestHandler
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+from pathlib import Path
+
+import datasets
+
+# Set cache directory for HuggingFace datasets
+cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
+cache_dir.mkdir(parents=True, exist_ok=True)
+os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
+
+def dice(s1: str, s2: str) -> float:
+    """Return the Dice coefficient of two strings, computed over character bigrams.
+
+    Bigrams are counted with multiplicity. When neither string yields a bigram
+    (both shorter than two characters) the result is 1.0.
+    """
+    from collections import Counter
+
+    pairs1 = Counter(s1[k : k + 2] for k in range(len(s1) - 1))
+    pairs2 = Counter(s2[k : k + 2] for k in range(len(s2) - 1))
+
+    total = sum(pairs1.values()) + sum(pairs2.values())
+    if total == 0:
+        return 1.0
+
+    # Counter intersection keeps the minimum count of every shared bigram
+    shared = sum((pairs1 & pairs2).values())
+    return 2 * shared / total
+
+def debug_log(message: str):
+    """Print the message to stderr and append it to /tmp/simulator-debug.log"""
+    print(message, file=sys.stderr)
+    # opened in append mode on every call, so earlier entries are kept
+    with open("/tmp/simulator-debug.log", "a") as f:
+        f.write(message + "\n")
+
+simulator: Optional["Simulator"] = None
+
+@dataclass
+class EvalState:
+    """Plain record of evaluation bookkeeping; holds no behavior of its own."""
+    id: str
+    tasks: List[str]
+    # result dicts keyed by task id
+    task_states: Dict[str, Dict]
+    sampling_config: Dict
+
+def normalize_number(s: str) -> Optional[int]:
+    """Parse the run of digits at the very start of `s`; return None when `s` does not start with a digit."""
+    leading_digits = re.match(r"\d+", s)
+    return int(leading_digits.group(0)) if leading_digits else None
+
+class AimeDataset:
+    """The AI-MO/aimo-validation-aime dataset held in memory, with fuzzy lookup of a question by text."""
+
+    def __init__(self, split: str = "train"):
+        self.split = split
+        self.questions: List[Dict] = []
+        self._load_dataset()
+
+    def _load_dataset(self):
+        """Load the configured split into `self.questions` as a list of rows."""
+        print(f"Loading AIME dataset (split: {self.split})...")
+
+        cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
+        if cache_path.exists():
+            print(f"Using cached dataset from {cache_path}")
+            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
+        else:
+            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
+
+        self.questions = list(ds)
+        print(f"AIME dataset loaded: {len(self.questions)} questions")
+
+    def find_question(self, request_text: str) -> Optional[Dict]:
+        """Return the dataset row whose "problem" text best matches `request_text`.
+
+        Matching is case-insensitive and tried in this order for each question:
+        exact match, exact match after removing `$...$` spans from the question,
+        then bigram Dice similarity. The best Dice candidate is returned only if
+        its score exceeds 0.3; otherwise None is returned.
+        """
+        # The request is the same for every question, so lower-case it once.
+        request_lower = request_text.lower()
+
+        best_match = None
+        best_score = -1  # highest Dice similarity seen so far (higher is better)
+        best_index = -1
+
+        for i, question in enumerate(self.questions):
+            question_text = question["problem"]
+            question_lower = question_text.lower()
+
+            # Exact match
+            if question_lower == request_lower:
+                debug_log(f"DEBUG: Found exact match at index {i}")
+                return question
+
+            # Remove LaTeX formatting for more flexible matching
+            question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
+            if question_no_latex.lower() == request_lower:
+                debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
+                return question
+
+            # Calculate Dice coefficient for partial matches
+            # Only consider if request is at least 50% of question length
+            if len(request_lower) >= len(question_lower) * 0.5:
+                score = dice(question_lower, request_lower)
+
+                if score > best_score:
+                    best_score = score
+                    best_match = question
+                    best_index = i
+
+        if best_match and best_score > 0.3:  # Threshold for partial match
+            debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_score:.3f}")
+            return best_match
+
+        debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
+        return None
+
+    def get_answer(self, question: Dict) -> str:
+        """Return the row's "answer" as a string.
+
+        String answers are reduced to their leading integer when they start
+        with digits; any other value is converted with `str`.
+        """
+        answer = question["answer"]
+        if isinstance(answer, str):
+            normalized = normalize_number(answer)
+            return str(normalized) if normalized is not None else answer
+        return str(answer)
+
+class Simulator:
+    """Fake chat-completions backend that answers AIME questions correctly with a configurable probability."""
+
+    def __init__(
+        self,
+        port: int = 8033,
+        host: str = "localhost",
+        success_rate: float = 0.8,
+        dataset_split: str = "train"
+    ):
+        self.port = port
+        self.host = host
+        self.success_rate = success_rate
+        self.dataset = AimeDataset(dataset_split)
+        self.eval_state = EvalState(
+            id="aime-2025",
+            tasks=["aime"],
+            task_states={},
+            sampling_config={"temperature": 0, "max_tokens": 2048}
+        )
+
+    def _generate_response(
+        self,
+        question: Dict,
+        should_be_correct: bool
+    ) -> Dict:
+        """Build a chat.completion-shaped dict whose content is the expected or a deliberately wrong answer."""
+        expected_answer = self.dataset.get_answer(question)
+
+        if should_be_correct:
+            response_text = expected_answer
+        else:
+            response_text = self._generate_wrong_answer(question)
+
+        return {
+            "id": f"chatcmpl-{int(time.time())}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": "llama",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": response_text
+                    },
+                    "finish_reason": "stop"
+                }
+            ],
+            # fixed placeholder token counts; nothing is actually tokenized
+            "usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 50,
+                "total_tokens": 150
+            }
+        }
+
+    def _generate_wrong_answer(self, question: Dict) -> str:
+        """Return an answer guaranteed to differ from the expected one (value + 1, or a " (wrong)" suffix)."""
+        expected_answer = self.dataset.get_answer(question)
+
+        if expected_answer.isdigit():
+            wrong_answer = str(int(expected_answer) + 1)
+        else:
+            wrong_answer = expected_answer + " (wrong)"
+
+        return wrong_answer
+
+    def _process_request(self, request_data: Dict) -> Dict:
+        """Answer one chat-completions request body.
+
+        Only the content of the first message is used to look up the question.
+        Returns either a completion dict or `{"error": ...}` when the request has
+        no messages or no question matches.
+        """
+        messages = request_data.get("messages", [])
+        if not messages:
+            return {"error": "No messages in request"}
+
+        request_text = messages[0].get("content", "")
+        debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
+
+        question = self.dataset.find_question(request_text)
+        if not question:
+            debug_log("DEBUG: find_question returned None")
+            return {"error": "No matching question found"}
+
+        # Each request is independently correct with probability `success_rate`.
+        should_be_correct = random.random() < self.success_rate
+
+        response = self._generate_response(question, should_be_correct)
+
+        # Only the most recent result is kept: every request overwrites the "aime" entry.
+        task_id = "aime"
+        self.eval_state.task_states[task_id] = {
+            "correct": should_be_correct,
+            "expected": self.dataset.get_answer(question),
+            "predicted": response["choices"][0]["message"]["content"]
+        }
+
+        return response
+
+class RequestHandler(BaseHTTPRequestHandler):
+    """HTTP handler exposing POST /v1/chat/completions backed by the module-level `simulator`."""
+
+    def do_POST(self):
+        """Parse the JSON body, forward it to the simulator and send back its result as JSON.
+
+        Responds 404 for any other path, 400 for an empty, malformed or
+        non-object JSON body, and 500 when the simulator is missing or raises.
+        """
+        if self.path != "/v1/chat/completions":
+            self._send_json({"error": "Not found"}, 404)
+            return
+
+        try:
+            content_length = int(self.headers.get("Content-Length", 0))
+            body = self.rfile.read(content_length)
+            request_data = json.loads(body) if body else None
+
+            # The simulator calls .get() on the payload, so anything that is not a
+            # non-empty JSON object is a client error rather than a server failure.
+            if not request_data or not isinstance(request_data, dict):
+                self._send_json({"error": "Invalid JSON"}, 400)
+                return
+
+            if simulator is None:
+                self._send_json({"error": "Simulator not initialized"}, 500)
+                return
+
+            # NOTE(review): an {"error": ...} result from the simulator is still sent with status 200.
+            response = simulator._process_request(request_data)
+            self._send_json(response, 200)
+
+        except json.JSONDecodeError:
+            self._send_json({"error": "Invalid JSON"}, 400)
+        except Exception as e:
+            print(f"Error processing request: {e}")
+            self._send_json({"error": str(e)}, 500)
+
+    def _send_json(self, data: dict, status: int = 200):
+        """Serialize `data` as UTF-8 JSON and write it with the given status and matching headers."""
+        body = json.dumps(data).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format, *args):
+        # Suppress default request logging
+        pass
+
+
+def main():
+    """Parse the command line, build the global simulator and serve HTTP until Ctrl+C."""
+    parser = argparse.ArgumentParser(
+        description="llama-server simulator for testing eval scripts"
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8033,
+        help="Server port (default: 8033)"
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Server host (default: localhost)"
+    )
+    parser.add_argument(
+        "--success-rate",
+        type=float,
+        default=0.8,
+        help="Success rate 0-1 (default: 0.8)"
+    )
+    parser.add_argument(
+        "--dataset-split",
+        type=str,
+        default="train",
+        help="AIME dataset split to use (default: train)"
+    )
+
+    args = parser.parse_args()
+
+    # RequestHandler reads this module-level instance, so it must be set before serving.
+    global simulator
+    simulator = Simulator(
+        port=args.port,
+        host=args.host,
+        success_rate=args.success_rate,
+        dataset_split=args.dataset_split
+    )
+
+    server = HTTPServer((args.host, args.port), RequestHandler)
+    # Serve from a daemon thread so the main thread stays free to receive Ctrl+C.
+    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
+    server_thread.start()
+
+    print("\n=== llama-server-simulator ===")
+    print(f"Server running on http://{args.host}:{args.port}")
+    print(f"Success rate: {args.success_rate}")
+    print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
+    print("\nPress Ctrl+C to stop\n")
+
+    try:
+        server_thread.join()
+    except KeyboardInterrupt:
+        print("\nShutting down...")
+        server.shutdown()
+        # shutdown() only stops the serve loop; release the listening socket as well
+        server.server_close()
+
+if __name__ == "__main__":
+    main()
--- a/examples/llama-eval/test-simulator.sh
+++ b/examples/llama-eval/test-simulator.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#
+# Manual smoke test for llama-server-simulator.py: starts the simulator in the
+# background, sends chat-completion requests to it and prints the answers.
+
+set -e
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "=== llama-server-simulator Test Script ==="
+echo ""
+
+PORT=8033
+SUCCESS_RATE=0.8
+SIMULATOR_PID=""
+
+# With `set -e` any failing command aborts the script, so make sure the
+# background simulator never outlives it.
+cleanup() {
+  if [ -n "$SIMULATOR_PID" ]; then
+    kill "$SIMULATOR_PID" 2>/dev/null || true
+    wait "$SIMULATOR_PID" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
+source "$SCRIPT_DIR/venv/bin/activate"
+python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
+SIMULATOR_PID=$!
+
+echo "Waiting for simulator to start..."
+sleep 5
+
+# Fail early, pointing at the log, if the simulator died during start-up
+if ! kill -0 "$SIMULATOR_PID" 2>/dev/null; then
+  echo "Simulator failed to start, see /tmp/simulator-test.log" >&2
+  exit 1
+fi
+
+# Helper function to make a request and extract the answer.
+#   $1 - question text, sent as the single user message
+# Prints the assistant message content, or the response's "error" field
+# ("No response" if that is missing too) when there is no content.
+make_request() {
+  local question="$1"
+  local payload
+  # Let Python serialize the body so that quotes, backslashes or newlines in
+  # the question cannot produce invalid JSON.
+  payload=$(python3 -c 'import json, sys; print(json.dumps({"model": "llama", "messages": [{"role": "user", "content": sys.argv[1]}], "temperature": 0, "max_tokens": 2048}))' "$question")
+  curl -s -X POST "http://localhost:$PORT/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d "$payload" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
+}
+
+# Test question (repeated in multiple tests)
+TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
+
+echo ""
+echo "=== Test 1: Correct Answer ==="
+echo "Sending request with known question..."
+answer=$(make_request "$TEST_QUESTION")
+echo "Answer: $answer"
+echo "Expected: 116"
+echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
+
+echo ""
+echo "=== Test 2: Wrong Answer ==="
+# The simulator started above is still running with SUCCESS_RATE, so report
+# that rate instead of claiming a 0.0 configuration that was never applied.
+echo "Sending request with known question (success rate $SUCCESS_RATE, a wrong answer is possible)..."
+answer=$(make_request "$TEST_QUESTION")
+echo "Answer: $answer"
+echo "Expected: 116"
+echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
+
+echo ""
+echo "=== Test 3: No Matching Question ==="
+echo "Sending request with non-matching text..."
+response=$(make_request "What is the capital of France?")
+echo "Response: $response"
+echo "Expected: No matching question found"
+echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
+
+echo ""
+echo "=== Test 4: Success Rate Verification ==="
+echo "Sending 10 requests to test success rate..."
+correct_count=0
+for i in {1..10}; do
+  answer=$(make_request "$TEST_QUESTION")
+  if [ "$answer" == "116" ]; then
+    correct_count=$((correct_count + 1))
+  fi
+  echo "  Request $i: Answer = $answer"
+done
+echo "Correct answers: $correct_count/10"
+echo "Expected: ~8/10 (80% success rate)"
+# Shell arithmetic is enough here and avoids depending on bc
+echo "Success rate: $((correct_count * 10))%"
+
+echo ""
+echo "=== Test Complete ==="
+echo "Stopping simulator..."
+# `|| true`: under `set -e` a simulator that already exited must not abort
+# the script before the final message is printed
+kill $SIMULATOR_PID 2>/dev/null || true
+wait $SIMULATOR_PID 2>/dev/null || true
+
+echo "Simulator stopped."
--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@@ -52,6 +52,10 @@ causal-convert-mm-model:
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/causal/convert-model.sh

+	$(MAKE) causal-convert-mmproj MM_OUTTYPE="$(MM_OUTTYPE)"
+
+causal-convert-mmproj:
+	$(call validate_model_path,causal-convert-mmproj)
 	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(MM_OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/causal/convert-model.sh --mmproj
--- a/examples/speculative-simple/README.md
+++ b/examples/speculative-simple/README.md
@@ -6,7 +6,7 @@ Demonstration of basic greedy speculative decoding
 ./bin/llama-speculative-simple \
    -m  ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
    -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
-    -f test.txt -c 0 -ngl 99 --color \
-    --sampling-seq k --top-k 1 -fa --temp 0.0 \
-    -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
+    -f test.txt -c 0 -ngl 99 --color on \
+    --sampling-seq k --top-k 1 -fa on --temp 0.0 \
+    -ngld 99 --spec-draft-n-max 16 --spec-draft-n-draft-min 5 --draft-p-min 0.9
 ```
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -13,20 +13,6 @@
 #include <vector>
 #include <utility>

-struct spec_checkpoint {
-    int64_t n_tokens = 0;
-
-    std::vector<uint8_t> data;
-
-    size_t size() const {
-        return data.size();
-    }
-
-    bool empty() const {
-        return data.empty();
-    }
-};
-
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");

@@ -43,11 +29,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.speculative.draft.mparams.path.empty()) {
-        LOG_ERR("%s: --model-draft is required\n", __func__);
-        return 1;
-    }
-
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -62,18 +43,11 @@ int main(int argc, char ** argv) {
    model_tgt = llama_init_tgt->model();
    ctx_tgt   = llama_init_tgt->context();

-    // check if the context supports partial sequence removal
-    const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
-    const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
-
-    if (use_ckpt) {
-        LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
-    }
-
    const llama_vocab * vocab = llama_model_get_vocab(model_tgt);

    // load the draft model
    llama_model_ptr model_dft;
+    llama_context_ptr ctx_dft;

    // TODO: simplify this logic
    {
@@ -81,9 +55,6 @@ int main(int argc, char ** argv) {

        auto params_dft = params;

-        params_dft.n_parallel   = 1;
-        params_dft.n_ctx        = params_spec.n_ctx;
-        params_dft.n_batch      = llama_n_ctx_seq(ctx_tgt);
        params_dft.devices      = params_spec.devices;
        params_dft.model        = params_spec.mparams;
        params_dft.n_gpu_layers = params_spec.n_gpu_layers;
@@ -103,8 +74,19 @@ int main(int argc, char ** argv) {
            return 1;
        }

-        params.speculative.draft.model = model_dft.get();
-        params.speculative.draft.cparams = common_context_params_to_llama(params_dft);
+        auto cparams = common_context_params_to_llama(params_dft);
+        ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams));
+
+        params.speculative.draft.ctx_tgt = ctx_tgt;
+        params.speculative.draft.ctx_dft = ctx_dft.get();
+    }
+
+    // check if the context supports partial sequence removal
+    const bool use_ckpt_tgt = (common_context_can_seq_rm(ctx_tgt)       == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
+    const bool use_ckpt_dft = (common_context_can_seq_rm(ctx_dft.get()) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
+
+    if (use_ckpt_tgt) {
+        LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
    }

    // Tokenize the prompt
@@ -136,6 +118,8 @@ int main(int argc, char ** argv) {
    // used to determine end of generation
    bool has_eos = false;

+    llama_seq_id seq_id = 0;
+
    // ================================================
    // everything until here is standard initialization
    // the relevant stuff for speculative decoding starts here
@@ -146,7 +130,8 @@ int main(int argc, char ** argv) {
    common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));

    // eval the prompt
-    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
+    llama_decode(ctx_tgt,       llama_batch_get_one(inp.data(), inp.size() - 1));
+    llama_decode(ctx_dft.get(), llama_batch_get_one(inp.data(), inp.size() - 1));

    // note: keep the last token separate!
    llama_token id_last = inp.back();
@@ -160,16 +145,16 @@ int main(int argc, char ** argv) {
    // init the speculator
    const auto & params_spec = params.speculative;

-    struct common_speculative * spec = common_speculative_init(params.speculative, ctx_tgt);
+    struct common_speculative * spec = common_speculative_init(params.speculative, 1);

-    common_speculative_begin(spec, prompt_tgt);
+    common_speculative_begin(spec, seq_id, prompt_tgt);

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);

    size_t n_draft = 0;

    llama_tokens draft;
-    spec_checkpoint spec_ckpt;
+    common_prompt_checkpoint ckpt;

    const auto t_enc_end = ggml_time_us();

@@ -184,40 +169,57 @@ int main(int argc, char ** argv) {
        // from a cache or lookup tables.
        //
        if (draft.empty()) {
+            ckpt.update_pos(
+                    prompt_tgt.size(),
+                    llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), seq_id),
+                    llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), seq_id));
+
+            if (use_ckpt_dft) {
+                ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+            }
+
            // generate a new draft
-            draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
+            common_speculative_get_draft_params(spec, seq_id) = {
+                /* .drafting   = */ true,
+                /* .n_max      = */ -1,
+                /* .n_past     = */ n_past,
+                /* .id_last    = */ id_last,
+                /* .prompt     = */ &prompt_tgt,
+                /* .result     = */ &draft, // output
+            };
+            common_speculative_draft(spec);

            // save the original draft size
            n_draft = draft.size();

            // save a checkpoint of the target context before evaluating the draft
            // this allows us to restore the state if partial draft acceptance occurs
-            if (!draft.empty() && use_ckpt) {
-                const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                spec_ckpt.data.resize(ckpt_size);
+            if (!draft.empty()) {
+                if (use_ckpt_tgt) {
+                    ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+                }
+            }

-                const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                GGML_ASSERT(n == ckpt_size);
+            {
+                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);

-                spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
-                LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
-                        spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
+                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
            }
        } else {
            // we have a previous (partial) draft to reuse from checkpoint restoration
-            if (use_ckpt) {
-                GGML_ASSERT(!spec_ckpt.empty());
+            if (use_ckpt_tgt) {
+                GGML_ASSERT(!ckpt.empty());
            }
        }

        // always have a token to evaluate from before - id_last
        common_batch_clear(batch_tgt);
-        common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);
+        common_batch_add  (batch_tgt, id_last, n_past++, { seq_id }, true);

        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
        {
            for (size_t i = 0; i < draft.size(); ++i) {
-                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+                common_batch_add(batch_tgt, draft[i], n_past + i, { seq_id }, true);
            }

            //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
@@ -225,9 +227,15 @@ int main(int argc, char ** argv) {
            llama_decode(ctx_tgt, batch_tgt);
        }

+        // evaluate the same batch with the draft model
+        {
+            // TODO: extend to support MTP, Eagle, etc. See server code for reference
+            llama_decode(ctx_dft.get(), batch_tgt);
+        }
+
        // only save the sampler sampler state if we use checkpoints
        common_sampler_ptr smpl_save;
-        if (use_ckpt) {
+        if (use_ckpt_tgt) {
            smpl_save.reset(common_sampler_clone(smpl.get()));
        }

@@ -247,17 +255,24 @@ int main(int argc, char ** argv) {
        // check for partial draft acceptance:
        // if the context doesn't support partial sequence removal, restore the checkpoint
        // and make the accepted tokens the new partial draft for the next iteration
-        if (use_ckpt && ids.size() - 1 < draft.size()) {
+        if (use_ckpt_tgt && ids.size() - 1 < draft.size()) {
            LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());

            draft = std::move(ids);

-            const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-            GGML_ASSERT(n == spec_ckpt.size());
+            {
+                ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);

-            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
+                llama_memory_seq_rm(llama_get_memory(ctx_tgt), seq_id, ckpt.pos_max + 1, -1);
+            }

-            prompt_tgt.resize(spec_ckpt.n_tokens);
+            {
+                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+
+                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
+            }
+
+            prompt_tgt.resize(ckpt.n_tokens);
            smpl = std::move(smpl_save);

            n_past = (int) prompt_tgt.size();
@@ -265,7 +280,7 @@ int main(int argc, char ** argv) {
            continue;
        }

-        common_speculative_accept(spec, ids.size() - 1);
+        common_speculative_accept(spec, seq_id, ids.size() - 1);

        // full acceptance: consume the draft and commit accepted tokens
        n_past    += ids.size() - 1;
@@ -305,7 +320,8 @@ int main(int argc, char ** argv) {
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

-            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
+            llama_memory_seq_rm(llama_get_memory(ctx_tgt),       seq_id, n_past, -1);
+            llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, n_past, -1);
        }

        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -249,6 +249,7 @@ option(GGML_SYCL                            "ggml: use SYCL"
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
 option(GGML_SYCL_HOST_MEM_FALLBACK          "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO         "ggml: use Level Zero API in SYCL backend"  ON)
 option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -450,12 +450,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ggml-cpu/arch/riscv/repack.cpp
            )
        if (GGML_CPU_RISCV64_SPACEMIT)
+            include(ggml-cpu/cmake/FindSMTIME.cmake)
            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
            list(APPEND GGML_CPU_SOURCES
                ggml-cpu/spacemit/ime.cpp
                ggml-cpu/spacemit/ime.h
+                ggml-cpu/spacemit/spine_mem_pool.cpp
+                ggml-cpu/spacemit/spine_mem_pool.h
+                ggml-cpu/spacemit/repack.cpp
+                ggml-cpu/spacemit/repack.h
+                ggml-cpu/spacemit/ime_env.cpp
+                ggml-cpu/spacemit/ime_env.h
                ggml-cpu/spacemit/ime1_kernels.cpp
+                ggml-cpu/spacemit/ime2_kernels.cpp
                ggml-cpu/spacemit/ime_kernels.h
+                ggml-cpu/spacemit/rvv_kernels.cpp
+                ggml-cpu/spacemit/rvv_kernels.h
            )
        endif()
        if(NOT GGML_CPU_ALL_VARIANTS)
@@ -485,6 +495,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            if (GGML_RV_ZIHINTPAUSE)
                string(APPEND MARCH_STR "_zihintpause")
            endif()
+            if (GGML_RV_ZBA)
+                string(APPEND MARCH_STR "_zba")
+            endif()
            if (GGML_CPU_RISCV64_SPACEMIT)
                # `xsmtvdotii' is only required for GCC >= 15.
                if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
--- a/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake
+++ b/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake
@@ -0,0 +1,32 @@
+# Probes the C toolchain for SpacemiT IME instruction support and derives
+# RISCV64_SPACEMIT_IME_SPEC from the results.
+#
+# Only active when CMAKE_SYSTEM_PROCESSOR starts with "riscv" and
+# GGML_CPU_RISCV64_SPACEMIT is enabled. Each probe builds a one-instruction
+# inline-asm program with the SpacemiT -march string and stores the outcome in
+# a SPACEMIT_RISCV_COMPILER_SUPPORT_* variable.
+
+# check_c_source_compiles() is provided by CheckCSourceCompiles
+include(CheckCSourceCompiles)
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)" AND GGML_CPU_RISCV64_SPACEMIT)
+    set(SMT_MARCH_STR "-march=rv64gcv_zfh_zvfh_zba_zicbop")
+    if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
+        CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
+        string(APPEND SMT_MARCH_STR "_xsmtvdotii")
+    endif()
+
+    # Remember the caller's flags so the probes do not clobber them
+    set(SMT_SAVED_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+    set(CMAKE_REQUIRED_FLAGS "${SMT_MARCH_STR}")
+
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S8)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vfwmadot v2, v0, v1, fp16\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFWMADOT_FP16)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S4)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S8)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot1 v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOTN)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vpack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vnspack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
+
+    set(CMAKE_REQUIRED_FLAGS "${SMT_SAVED_REQUIRED_FLAGS}")
+    unset(SMT_SAVED_REQUIRED_FLAGS)
+
+    # Makes sure the variable is defined; any value it already holds is kept
+    # unless the IME1 probe below replaces it.
+    list(APPEND RISCV64_SPACEMIT_IME_SPEC "")
+    if (SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
+        set(RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME1")
+    endif()
+
+    # IME2 needs the 4-bit vmadot form plus both pack instructions
+    if (SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4 AND SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK AND SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
+        list(APPEND RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME2")
+    endif()
+
+    message(STATUS "RISCV64_SPACEMIT_IME_SPEC: ${RISCV64_SPACEMIT_IME_SPEC}")
+endif()
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -50,6 +50,10 @@
 #include "llamafile/sgemm.h"
 #endif

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+#    include "spacemit/ime.h"
+#endif
+
 // Note: once we move threading into a separate C++ file
 // will use std::hardware_destructive_interference_size instead of hardcoding it here
 // and we'll use C++ attribute syntax.
@@ -3011,7 +3015,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    const struct ggml_cgraph * cgraph = tp->cgraph;
    const struct ggml_cplan  * cplan  = tp->cplan;

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+    ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(state->ith);
+#else
    set_numa_thread_affinity(state->ith);
+#endif

    struct ggml_compute_params params = {
        /*.ith        =*/ state->ith,
@@ -3068,6 +3076,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    ggml_barrier(state->threadpool);

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+    ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(state->ith);
+#endif
+
    return 0;
 }

--- a/ggml/src/ggml-cpu/spacemit/ime.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime.h
+++ b/ggml/src/ggml-cpu/spacemit/ime.h
@@ -8,6 +8,14 @@ extern "C" {

 ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);

+void ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(int thread_n);
+
+void ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(int thread_n);
+
+void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment);
+
+void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime_env.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime_env.cpp
@@ -0,0 +1,320 @@
+#include "ime_env.h"
+
+#include "ggml-impl.h"
+#include "spine_mem_pool.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <limits>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <unordered_map>
+
+namespace ggml::cpu::riscv64_spacemit {
+// Parses /proc/cpuinfo and fills `result` (cleared first) with one entry for
+// every processor block whose marchid is a known SpacemiT core.
+//
+// Returns false when the file cannot be opened or when no processor block
+// with a marchid line was found, true otherwise.
+bool spine_core_info::get_spine_core_info(std::vector<spine_core_info> & result) {
+    static const std::unordered_map<uint64_t, spine_core_arch_id> spine_march_mapping_ = {
+        { 0x8000000058000001, spine_core_arch_id::core_arch_x60 },
+        { 0x8000000041000001, spine_core_arch_id::core_arch_a60 },
+        { 0x8000000058000002, spine_core_arch_id::core_arch_x100},
+        { 0x8000000041000002, spine_core_arch_id::core_arch_a100},
+    };
+
+    result.clear();
+    std::ifstream file("/proc/cpuinfo");
+    std::string   line;
+
+    // {processor id, marchid} for every complete processor block
+    std::vector<std::array<uint64_t, 2>> cpu_info_list;
+
+    uint64_t current_processor = spine_invalid_core_id;
+    uint64_t current_marchid   = 0;
+    bool     has_processor     = false;
+    bool     has_marchid       = false;
+
+    if (!file.is_open()) {
+        return false;
+    }
+
+    while (std::getline(file, line)) {
+        if (line.compare(0, 9, "processor") == 0) {
+            // a new block starts: flush the previous one if it was complete
+            if (has_processor && has_marchid) {
+                cpu_info_list.push_back({ current_processor, current_marchid });
+            }
+
+            size_t colon_pos = line.find(':');
+            if (colon_pos != std::string::npos) {
+                current_processor = std::stoi(line.substr(colon_pos + 1));
+                has_processor     = true;
+            }
+
+            has_marchid = false;
+        } else if (line.compare(0, 7, "marchid") == 0) {
+            size_t colon_pos = line.find(':');
+            if (colon_pos != std::string::npos) {
+                std::string marchid_str = line.substr(colon_pos + 1);
+                // std::isspace needs an unsigned char value, hence the lambda
+                marchid_str.erase(std::remove_if(marchid_str.begin(), marchid_str.end(),
+                                                 [](unsigned char ch) { return std::isspace(ch) != 0; }),
+                                  marchid_str.end());
+                current_marchid = std::stoull(marchid_str, nullptr, 16);
+                has_marchid     = true;
+            }
+        }
+    }
+
+    // flush the last block of the file
+    if (has_processor && has_marchid) {
+        cpu_info_list.push_back({ current_processor, current_marchid });
+    }
+
+    // Convert every collected block. This must not depend on whether the
+    // *last* block had a marchid, otherwise one incomplete trailing entry
+    // would discard all the cores parsed before it.
+    for (const auto & cpu_info : cpu_info_list) {
+        if (cpu_info[0] == spine_invalid_core_id) {
+            continue;
+        }
+
+        const auto mapping = spine_march_mapping_.find(cpu_info[1]);
+        if (mapping == spine_march_mapping_.end()) {
+            continue;  // not a known SpacemiT core
+        }
+
+        auto core_info    = spine_core_info();
+        core_info.core_id = cpu_info[0];
+        core_info.arch_id = mapping->second;
+
+        result.push_back(core_info);
+    }
+
+    return !cpu_info_list.empty();
+}
+
+namespace {
+// Converts a hexadecimal string, with or without a leading "0x"/"0X", to a
+// uint16_t. Throws std::invalid_argument when no number can be parsed and
+// std::out_of_range when the value does not fit.
+uint16_t hex_string_to_u16(const std::string & hex_str) {
+    const bool   has_prefix = hex_str.compare(0, 2, "0x") == 0 || hex_str.compare(0, 2, "0X") == 0;
+    const size_t digits_at  = has_prefix ? 2 : 0;
+
+    unsigned long value = 0;
+    try {
+        value = std::stoul(hex_str.substr(digits_at), nullptr, 16);
+    } catch (const std::invalid_argument &) {
+        throw std::invalid_argument("Invalid hexadecimal string");
+    }
+    // a std::out_of_range raised by std::stoul propagates unchanged
+
+    if (value > std::numeric_limits<uint16_t>::max()) {
+        throw std::out_of_range("Converted value is out of range for uint16_t");
+    }
+    return static_cast<uint16_t>(value);
+}
+
+// Returns the upper-case name used for a memory pool backend, or "unknown"
+// for a value outside the enumerators handled below.
+const char * spine_mem_pool_backend_to_string(spine_mem_pool_backend backend) {
+    const char * name = "unknown";
+
+    switch (backend) {
+        case spine_mem_pool_backend::none:
+            name = "NONE";
+            break;
+        case spine_mem_pool_backend::posix_memalign:
+            name = "POSIX";
+            break;
+        case spine_mem_pool_backend::transparent_hugepage:
+            name = "HPAGE";
+            break;
+        case spine_mem_pool_backend::hugetlb_1g:
+            name = "HPAGE1GB";
+            break;
+    }
+
+    return name;
+}
+
+// Maps a backend name (case-insensitive) to its spine_mem_pool_backend.
+// A null or empty string selects transparent_hugepage; any other unrecognized
+// value throws std::runtime_error.
+spine_mem_pool_backend parse_mem_backend(const char * mem_backend_str) {
+    const bool is_unset = mem_backend_str == nullptr || mem_backend_str[0] == '\0';
+    if (is_unset) {
+        return spine_mem_pool_backend::transparent_hugepage;
+    }
+
+    std::string value(mem_backend_str);
+    for (char & ch : value) {
+        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+    }
+
+    struct backend_name {
+        const char *           name;
+        spine_mem_pool_backend backend;
+    };
+
+    static const backend_name known_backends[] = {
+        { "none",     spine_mem_pool_backend::none                 },
+        { "posix",    spine_mem_pool_backend::posix_memalign       },
+        { "hpage",    spine_mem_pool_backend::transparent_hugepage },
+        { "hpage1gb", spine_mem_pool_backend::hugetlb_1g           },
+    };
+
+    for (const auto & entry : known_backends) {
+        if (value == entry.name) {
+            return entry.backend;
+        }
+    }
+
+    throw std::runtime_error("invalid SPACEMIT_MEM_BACKEND: " + value + ", expected NONE, POSIX, HPAGE or HPAGE1GB");
+}
+}  // namespace
+
+// Builds the SpacemiT runtime configuration.
+//
+// Core information is parsed from /proc/cpuinfo, or taken from
+// SPACEMIT_CORE_ARCH (hex arch id applied to every core) when nothing was
+// parsed. The preferred cores can be narrowed with SPACEMIT_PERFER_CORE_ARCH
+// (hex arch id) and SPACEMIT_PERFER_CORE_ID (comma-separated core ids); unset
+// or empty values are ignored. SPACEMIT_MEM_BACKEND selects the memory pool
+// backend, and a SPACEMIT_DISABLE_TCM value other than "0" skips TCM
+// initialization.
+//
+// Throws std::runtime_error when no core information is available (and
+// propagates the parse errors of the helpers above); aborts via GGML_ABORT on
+// inconsistent SPACEMIT_PERFER_* settings.
+spine_env_info::spine_env_info() {
+    num_cores = static_cast<int>(std::thread::hardware_concurrency());
+    spine_core_info::get_spine_core_info(core_info_list);
+
+    // special for x60 K1
+    if (core_info_list.size() == 8 && core_info_list[0].arch_id == spine_core_arch_id::core_arch_x60) {
+        for (int i = 0; i < 4; i++) {
+            core_info_list[i].arch_id = spine_core_arch_id::core_arch_a60;
+        }
+    }
+
+    // special for qemu
+    if (core_info_list.size() == 0) {
+        char * spine_core_arch_str = getenv("SPACEMIT_CORE_ARCH");
+        if (spine_core_arch_str != nullptr) {
+            auto arch_id = hex_string_to_u16(spine_core_arch_str);
+            for (int i = 0; i < num_cores; i++) {
+                auto core_info    = spine_core_info();
+                core_info.core_id = i;
+                core_info.arch_id = spine_core_arch_id{ arch_id };
+                core_info_list.push_back(core_info);
+            }
+        }
+    }
+
+    if (core_info_list.size() == 0) {
+        throw std::runtime_error(
+            "Failed to get SPACEMIT_CORE_ARCH from environment or failed to parse it from /proc/cpuinfo");
+    }
+
+    // Note: the emptiness test has to look at the first character; comparing
+    // the pointer against "" never detects an empty value.
+    char * spine_perfer_core_arch_str = getenv("SPACEMIT_PERFER_CORE_ARCH");
+    if (spine_perfer_core_arch_str != nullptr && spine_perfer_core_arch_str[0] != '\0') {
+        perfer_core_arch_id = spine_core_arch_id{ hex_string_to_u16(spine_perfer_core_arch_str) };
+    }
+
+    char *           spine_perfer_core_id_str = getenv("SPACEMIT_PERFER_CORE_ID");
+    std::vector<int> perfer_core_id_vec;
+    if (spine_perfer_core_id_str != nullptr && spine_perfer_core_id_str[0] != '\0') {
+        // split the comma-separated list of core ids
+        std::string perfer_core_id_str(spine_perfer_core_id_str);
+        size_t      start = 0;
+        size_t      end   = 0;
+        while ((end = perfer_core_id_str.find(',', start)) != std::string::npos) {
+            std::string core_id_substr = perfer_core_id_str.substr(start, end - start);
+            perfer_core_id_vec.push_back(std::stoi(core_id_substr));
+            start = end + 1;
+        }
+        std::string core_id_substr = perfer_core_id_str.substr(start);
+        perfer_core_id_vec.push_back(std::stoi(core_id_substr));
+    }
+
+    perfer_core_ids.reserve(num_cores);
+    if (perfer_core_arch_id == spine_core_arch_id::core_arch_none) {
+        // no explicit preference: pick every core whose arch id starts with 0xA
+        for (auto & core_info : core_info_list) {
+            auto core_arch_id   = core_info.arch_id;
+            auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
+            if (core_arch_head == 0xA) {
+                num_perfer_cores++;
+                perfer_core_arch_id = core_arch_id;
+                cpu_mask |= (1ULL << core_info.core_id);
+                perfer_core_ids.push_back(core_info.core_id);
+            }
+        }
+    } else {
+        // explicit preference: pick every core with exactly that arch id
+        for (auto & core_info : core_info_list) {
+            auto core_arch_id = core_info.arch_id;
+            if (core_arch_id == perfer_core_arch_id) {
+                num_perfer_cores++;
+                cpu_mask |= (1ULL << core_info.core_id);
+
+                auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
+                if (core_arch_head == 0xA) {
+                    perfer_core_ids.push_back(core_info.core_id);
+                }
+            }
+        }
+        if (num_perfer_cores == 0) {
+            GGML_ABORT("can not find core with arch id %x for SPACEMIT_PERFER_CORE_ARCH in core info list\n",
+                       (uint16_t) perfer_core_arch_id);
+        }
+    }
+
+    // an explicit core id list replaces the selection made above
+    if (perfer_core_id_vec.size() > 0) {
+        perfer_core_ids.clear();
+        cpu_mask         = 0;
+        num_perfer_cores = 0;
+        for (int core_id : perfer_core_id_vec) {
+            if (core_id < 0 || core_id >= num_cores) {
+                GGML_ABORT("invalid core id in SPACEMIT_PERFER_CORE_ID: %d, should be between 0 and %d\n", core_id,
+                           num_cores - 1);
+            }
+            // core_info_list can hold fewer entries than num_cores (unknown
+            // cores are not listed), so guard the index separately
+            if (static_cast<size_t>(core_id) >= core_info_list.size()) {
+                GGML_ABORT("core id %d in SPACEMIT_PERFER_CORE_ID has no entry in core info list (%zu entries)\n",
+                           core_id, core_info_list.size());
+            }
+            auto core_info    = core_info_list[core_id];
+            auto core_arch_id = core_info.arch_id;
+            if (core_arch_id == perfer_core_arch_id) {
+                cpu_mask |= (1ULL << core_id);
+                perfer_core_ids.push_back(core_id);
+            } else {
+                GGML_ABORT(
+                    "core id %d in SPACEMIT_PERFER_CORE_ID has arch id %x which does not match "
+                    "SPACEMIT_PERFER_CORE_ARCH %x\n",
+                    core_id, (uint16_t) core_arch_id, (uint16_t) perfer_core_arch_id);
+            }
+        }
+        std::string perfer_core_id_vec_str;
+        for (int core_id : perfer_core_id_vec) {
+            perfer_core_id_vec_str += std::to_string(core_id) + ",";
+        }
+        perfer_core_id_vec_str.pop_back();
+        GGML_LOG_DEBUG("SPACEMIT_PERFER_CORE_ID is set, perferred core ids: %s\n", perfer_core_id_vec_str.c_str());
+        num_perfer_cores = static_cast<int>(perfer_core_id_vec.size());
+    }
+
+    use_ime1 = perfer_core_arch_id == spine_core_arch_id::core_arch_a60 ||
+               perfer_core_arch_id == spine_core_arch_id::core_arch_x100;
+
+    use_ime2 = perfer_core_arch_id == spine_core_arch_id::core_arch_a100;
+
+    mem_backend                  = parse_mem_backend(getenv("SPACEMIT_MEM_BACKEND"));
+    char * spine_disable_tcm_str = getenv("SPACEMIT_DISABLE_TCM");
+    auto   user_disable_tcm      = spine_disable_tcm_str != nullptr && strcmp(spine_disable_tcm_str, "0") != 0;
+
+    if (!user_disable_tcm) {
+        spine_mem_pool_tcm_info tcm_info;
+        if (spine_mem_pool_tcm_init(&tcm_info)) {
+            use_tcm      = tcm_info.available;
+            tcm_blk_size = tcm_info.blk_size;
+            GGML_LOG_DEBUG("CPU_RISCV64_SPACEMIT: tcm is available, blk_size: %zu, blk_num: %zu, is_fake_tcm: %d\n",
+                           tcm_info.blk_size, tcm_info.blk_num, tcm_info.is_fake_tcm);
+
+            // count the entries that precede the first core with a 0xA arch id
+            for (auto & core_info : core_info_list) {
+                auto core_arch_head = (uint16_t) (core_info.arch_id) >> 12;
+                if (core_arch_head != 0xA) {
+                    aicpu_id_offset++;
+                } else {
+                    break;
+                }
+            }
+        }
+    }
+
+    GGML_LOG_DEBUG(
+        "CPU_RISCV64_SPACEMIT: num_cores: %d, num_perfer_cores: %d, perfer_core_arch_id: %x, exclude_main_thread: %d, "
+        "use_ime1: %d, use_ime2: %d, mem_backend: %s, cpu_mask: %lx, aicpu_id_offset: %d\n",
+        num_cores, num_perfer_cores, (uint16_t) perfer_core_arch_id, exclude_main_thread, use_ime1, use_ime2,
+        spine_mem_pool_backend_to_string(mem_backend), cpu_mask, aicpu_id_offset);
+
+    // prefer shared memory for the init barriers, fall back to the heap
+    const size_t init_barrier_size = sizeof(spine_barrier_t) * spine_init_barrier_count;
+    init_barrier =
+        static_cast<spine_barrier_t *>(spine_mem_pool_shared_mem_alloc(init_barrier_size, alignof(spine_barrier_t)));
+    if (init_barrier != nullptr) {
+        init_barrier_is_shared_mem = true;
+    } else {
+        GGML_LOG_WARN("CPU_RISCV64_SPACEMIT: failed to allocate init_barrier from shared mem, falling back to heap\n");
+        init_barrier = new spine_barrier_t[spine_init_barrier_count];
+    }
+
+    spine_barrier_init(init_barrier, spine_init_barrier_count, 2);
+}
+
+// Releases init_barrier with the deallocator matching how the constructor
+// obtained it, then resets both members.
+spine_env_info::~spine_env_info() {
+    if (!init_barrier_is_shared_mem) {
+        delete[] init_barrier;
+    } else {
+        spine_mem_pool_shared_mem_free(init_barrier);
+    }
+
+    init_barrier_is_shared_mem = false;
+    init_barrier               = nullptr;
+}
+
+spine_env_info global_spine_env_info;
+
+}  // namespace ggml::cpu::riscv64_spacemit
--- a/ggml/src/ggml-cpu/spacemit/ime_env.h
+++ b/ggml/src/ggml-cpu/spacemit/ime_env.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include "spine_barrier.h"
+#include "spine_mem_pool.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace ggml::cpu::riscv64_spacemit {
+
+constexpr uint64_t spine_invalid_core_id    = 0xFFFFFFFF;
+constexpr size_t   spine_init_barrier_count = 16;
+
+// 16-bit core micro-architecture identifiers. The enumerators fall into two groups by
+// their top nibble: 0x5 (x60/x100/x200) and 0xA (a60/a100/a200).
+// NOTE(review): spine_env_info's constructor tests `arch_id >> 12 != 0xA` while computing
+// aicpu_id_offset, so the 0xA group is presumably the AI-capable cores — confirm.
+enum class spine_core_arch_id : uint16_t {
+    core_arch_none = 0,
+    core_arch_x60  = 0x503C,
+    core_arch_x100 = 0x5064,
+    core_arch_x200 = 0x50C8,
+    core_arch_a60  = 0xA03C,
+    core_arch_a100 = 0xA064,
+    core_arch_a200 = 0xA0C8,
+};
+
+// Identity of one CPU core: its id and its micro-architecture.
+struct spine_core_info {
+    // Stays at spine_invalid_core_id until filled in.
+    uint64_t           core_id{ spine_invalid_core_id };
+    spine_core_arch_id arch_id{ spine_core_arch_id::core_arch_none };
+
+    // Fills `result` with per-core information. The definition is not in this header;
+    // the bool return presumably signals success — confirm against the implementation.
+    static bool get_spine_core_info(std::vector<spine_core_info> & result);
+};
+
+// Description of the SpacemiT runtime environment: detected cores, selected feature
+// flags and the barrier array set up by the constructor.
+// NOTE(review): the type owns init_barrier through a raw pointer and has a user-defined
+// destructor, but its copy operations are left implicit, so copying an instance would
+// release the barrier array twice. Confirm that nothing copies it.
+struct spine_env_info {
+    std::vector<spine_core_info> core_info_list;
+    std::vector<int>             perfer_core_ids;
+    // NOTE(review): the visible part of the constructor increments this once per leading
+    // core_info_list entry whose arch id top nibble is not 0xA — confirm the other paths.
+    int                          aicpu_id_offset{ 0 };
+    int                          num_cores{ 0 };
+    int                          num_perfer_cores{ 0 };
+    spine_core_arch_id           perfer_core_arch_id{ spine_core_arch_id::core_arch_none };
+    bool                         exclude_main_thread{ false };
+    bool                         use_ime2{ false };
+    bool                         use_ime1{ false };
+    bool                         use_tcm{ false };
+    spine_mem_pool_backend       mem_backend{ spine_mem_pool_backend::transparent_hugepage };
+    uint64_t                     tcm_blk_size{ 0 };
+    uint64_t                     cpu_mask{ 0 };
+    // Array of spine_init_barrier_count barriers, taken from the shared-memory pool or,
+    // if that allocation fails, from new[].
+    spine_barrier_t *            init_barrier{ nullptr };
+    // Records which of the two allocators produced init_barrier so the destructor can
+    // release it through the matching one.
+    bool                         init_barrier_is_shared_mem{ false };
+
+    spine_env_info();
+    ~spine_env_info();
+};
+
+extern spine_env_info global_spine_env_info;
+
+}  // namespace ggml::cpu::riscv64_spacemit
--- a/ggml/src/ggml-cpu/spacemit/ime_kernels.h
+++ b/ggml/src/ggml-cpu/spacemit/ime_kernels.h
@@ -1,26 +1,189 @@
 #pragma once

+#include <cassert>
 #include <cstddef>
+#include <cstdint>
+#include <functional>
+
+namespace spacemit_kernels {
+
+#define BLOCK_QNK_LEN 256
+
+// N-row interleaved block covering BLOCK_QNK_LEN (256) elements per row; by its name,
+// presumably the repacked form of Q2_K weights — confirm against the repack code.
+template <int N> struct nrow_block_q2_k {
+    // [4bit scale + 4bit zp] * N * 16
+    uint8_t  scales[N * BLOCK_QNK_LEN / 16];  // 16 bytes per row
+    // [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
+    // [b64, b80, b96, b112] ...[b79, b95, b111, b127]
+    // [b128, b144, b160, b176] ...[b143, b159, b175, b191]
+    // [b192, b208, b224, b240] ...[b207, b223, b239, b255]
+    uint8_t  qs[N * BLOCK_QNK_LEN / 4];  // 64 bytes per row, i.e. four values per byte
+    uint16_t scales16[N];                // one 16-bit value per row
+    uint16_t zeros16[N];                 // one 16-bit value per row
+};
+
+// N-row interleaved block covering BLOCK_QNK_LEN (256) elements per row; by its name,
+// presumably the repacked form of Q3_K weights — confirm against the repack code.
+template <int N> struct nrow_block_q3_k {
+    // [8bit scale] * N * 16
+    int8_t   scales[N * 16];
+    // [b0, b1, b2, b3, b4, b5, b6, b7] ... [b248, b249, b250, b251, b252, b253, b254, b255]
+    uint8_t  hmask[N * BLOCK_QNK_LEN / 8];  // 32 bytes per row, one bit per element
+    // [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
+    // [b64, b80, b96, b112] ...[b79, b95, b111, b127]
+    // [b128, b144, b160, b176] ...[b143, b159, b175, b191]
+    // [b192, b208, b224, b240] ...[b207, b223, b239, b255]
+    uint8_t  qs[N * BLOCK_QNK_LEN / 4];  // 64 bytes per row, i.e. four values per byte
+    uint16_t scales16[N];                // one 16-bit value per row
+};
+
+// N-row interleaved block with 1 + 4 + 16 bytes per row.
+// NOTE(review): by its name this is presumably the repacked MXFP4 layout (e = shared
+// exponent, qh/qs = high bits / low nibbles) — confirm against the repack code.
+template <int N> struct nrow_block_mxfp4 {
+    uint8_t e[N];        // one byte per row
+    uint8_t qh[4 * N];   // four bytes per row
+    uint8_t qs[16 * N];  // sixteen bytes per row
+};
+
+// N-row interleaved block; packed so that no padding is inserted between the arrays
+// (2 + 1 + 4 + 16 = 23 bytes per row, which the static_assert after this struct pins for N = 1).
+template <int N> struct __attribute__((packed)) nrow_block_q5_1 {
+    uint16_t scales16[N];  // one 16-bit value per row
+    uint8_t  zp[N];        // one byte per row
+    // n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
+    uint8_t  qh[4 * N];
+    // n0 [b0, b1], [b2, b3] ....  [b30, b31]
+    // n1 [b0, b1], [b2, b3] ....  [b30, b31]
+    uint8_t  qs[16 * N];
+};
+
+static_assert(sizeof(nrow_block_q5_1<1>) == sizeof(uint8_t) + 22, "wrong nrow_block_q5_1 block size/padding");
+
+// N-row interleaved block; packed so that no padding is inserted between the arrays
+// (2 + 4 + 16 = 22 bytes per row, which the static_assert after this struct pins for N = 1).
+template <int N> struct __attribute__((packed)) nrow_block_q5_0 {
+    uint16_t scales16[N];  // one 16-bit value per row
+    // n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
+    uint8_t  qh[4 * N];
+    // n0 [b0, b1], [b2, b3] ....  [b30, b31]
+    // n1 [b0, b1], [b2, b3] ....  [b30, b31]
+    uint8_t  qs[16 * N];
+};
+
+static_assert(sizeof(nrow_block_q5_0<1>) == 22, "wrong nrow_block_q5_0 block size/padding");
+
+// Type-erased signature matching the single-matrix gemm_kernel_* prototypes in this
+// header. Argument order, as in those prototypes: blk_len, quant_a_ptr, quant_b_data,
+// quant_b_zp, c_ptr, count_m, count_n, k_blks, ldc.
+using gemm_kernel_quantize_def = std::function<
+    size_t(size_t, const uint8_t *, const uint8_t *, const uint8_t *, float *, size_t, size_t, size_t, size_t)>;
+
+// Same argument order as above, but matching the moe_m2_gemm_kernel_* prototypes:
+// quant_a_ptr and c_ptr are arrays of pointers instead of single pointers.
+using moe_gemm_kernel_quantize_def = std::function<
+    size_t(size_t, const uint8_t **, const uint8_t *, const uint8_t *, float **, size_t, size_t, size_t, size_t)>;

-namespace sqnbitgemm_spacemit_ime {
 namespace ime1 {
-size_t gemm_kernel_i8i4(size_t            blk_len,
-                        const std::byte * quant_a_ptr,
-                        const std::byte * quant_b_data,
-                        const float *     quant_b_scale,
-                        const std::byte * quant_b_zp,
-                        float *           c_ptr,
-                        size_t            count_m,
-                        size_t            count_n,
-                        size_t            count_k,
-                        size_t            block_count_k,
-                        size_t            ldc,
-                        const float *     bias,
-                        const size_t      scale_stride);
+size_t gemm_kernel_i8i4(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);

-void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);

-void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);

 }  // namespace ime1
-}  // namespace sqnbitgemm_spacemit_ime
+
+namespace ime2 {
+size_t gemm_kernel_i8i2k(size_t          blk_len,
+                         const uint8_t * quant_a_ptr,
+                         const uint8_t * quant_b_data,
+                         const uint8_t * quant_b_zp,
+                         float *         c_ptr,
+                         size_t          count_m,
+                         size_t          count_n,
+                         size_t          k_blks,
+                         size_t          ldc);
+
+size_t gemm_kernel_i8i3k(size_t          blk_len,
+                         const uint8_t * quant_a_ptr,
+                         const uint8_t * quant_b_data,
+                         const uint8_t * quant_b_zp,
+                         float *         c_ptr,
+                         size_t          count_m,
+                         size_t          count_n,
+                         size_t          k_blks,
+                         size_t          ldc);
+
+size_t gemm_kernel_i8i4(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t gemm_kernel_i8i4_hp(size_t          blk_len,
+                           const uint8_t * quant_a_ptr,
+                           const uint8_t * quant_b_data,
+                           const uint8_t * quant_b_zp,
+                           float *         c_ptr,
+                           size_t          count_m,
+                           size_t          count_n,
+                           size_t          k_blks,
+                           size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8i4(size_t           blk_len,
+                               const uint8_t ** quant_a_ptr,
+                               const uint8_t *  quant_b_data,
+                               const uint8_t *  quant_b_zp,
+                               float **         c_ptr,
+                               size_t           count_m,
+                               size_t           count_n,
+                               size_t           k_blks,
+                               size_t           ldc);
+
+size_t gemm_kernel_i8i8(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t gemm_kernel_i8mxfp4(size_t          blk_len,
+                           const uint8_t * quant_a_ptr,
+                           const uint8_t * quant_b_data,
+                           const uint8_t * quant_b_zp,
+                           float *         c_ptr,
+                           size_t          count_m,
+                           size_t          count_n,
+                           size_t          k_blks,
+                           size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8mxfp4(size_t           blk_len,
+                                  const uint8_t ** quant_a_ptr,
+                                  const uint8_t *  quant_b_data,
+                                  const uint8_t *  quant_b_zp,
+                                  float **         c_ptr,
+                                  size_t           count_m,
+                                  size_t           count_n,
+                                  size_t           k_blks,
+                                  size_t           ldc);
+
+size_t gemm_kernel_i8i5(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8i5(size_t           blk_len,
+                               const uint8_t ** quant_a_ptr,
+                               const uint8_t *  quant_b_data,
+                               const uint8_t *  quant_b_zp,
+                               float **         c_ptr,
+                               size_t           count_m,
+                               size_t           count_n,
+                               size_t           k_blks,
+                               size_t           ldc);
+}  // namespace ime2
+}  // namespace spacemit_kernels
--- a/ggml/src/ggml-cpu/spacemit/repack.cpp
+++ b/ggml/src/ggml-cpu/spacemit/repack.cpp
--- a/ggml/src/ggml-cpu/spacemit/repack.h
+++ b/ggml/src/ggml-cpu/spacemit/repack.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "ggml-common.h"
+#include "ggml.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace ggml::cpu::riscv64_spacemit {
+
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+int repack(ggml_tensor * t, const void * data, size_t data_size);
+
+}  // namespace ggml::cpu::riscv64_spacemit
--- a/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp
+++ b/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp
--- a/ggml/src/ggml-cpu/spacemit/rvv_kernels.h
+++ b/ggml/src/ggml-cpu/spacemit/rvv_kernels.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include "ggml-cpu-impl.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+
+namespace spacemit_kernels {
+
+// Integer division of `up` by `down`, rounded towards +infinity for non-negative inputs.
+// `up + down - 1` is evaluated in the operands' common type and is not overflow-checked.
+constexpr auto div_round_up(auto up, auto down) {
+    const auto biased = up + down - 1;
+    return biased / down;
+}
+
+// Byte size of one Q8 block: an f32, blk_len int8 values and, when with_blk_sum is set,
+// one extra int16.
+// Q8 Blk [f32] [s16] [int8 * blk_len]
+// Q8 Blk N [f32 * N] [s16 * N] [int8 * blk_len * N]
+constexpr size_t q8_blk_size(size_t blk_len, bool with_blk_sum = false) {
+    const size_t sum_bytes = with_blk_sum ? sizeof(int16_t) : 0;
+    return sizeof(float) + blk_len * sizeof(int8_t) + sum_bytes;
+}
+
+// Q8 HP row block: K is split into K32 subblocks (subblock count = blk_len / 32, rounded up).
+// The size accounts for blk_len int8 values, one _Float16 per subblock, an optional second
+// _Float16 per subblock (with_blk_sum) and one optional _Float16 for the whole block
+// (with_blk_scale).
+// NOTE(review): this comment used to describe the per-subblock scale as f32, but the size
+// below reserves sizeof(_Float16) for it — confirm which width the kernels expect.
+constexpr size_t q8_hp_blk_size(size_t blk_len, bool with_blk_sum = false, bool with_blk_scale = false) {
+    const size_t subblk_count = div_round_up(blk_len, size_t(32));
+    const size_t blk_size     = blk_len * sizeof(int8_t) + subblk_count * sizeof(_Float16) +
+                            (with_blk_sum ? subblk_count * sizeof(_Float16) : 0) +
+                            (with_blk_scale ? sizeof(_Float16) : 0);
+    return blk_size;
+}
+
+// Byte size of one Q8K block: an f32, one int16 per 16 elements and blk_len int8 values.
+// Q8K Blk [f32] [s16 * (blk_len / 16)] [int8 * blk_len]
+// Q8K Blk N [f32 * N] [s16 * (blk_len / 16) * N] [int8 * blk_len * N]
+constexpr size_t q8k_blk_size(size_t blk_len) {
+    // Multiply before dividing, exactly as the layout code expects for this size.
+    const size_t sums_bytes = sizeof(int16_t) * blk_len / 16;
+    return sizeof(float) + blk_len * sizeof(int8_t) + sums_bytes;
+}
+
+using quantize_a_row_def = std::function<void(size_t, const float *, size_t, uint8_t *)>;
+
+namespace rvv {
+void memcpy1d(void * dst, const void * src, int64_t size);
+
+void memcpy2d(void * dst, int64_t dst_stride, const void * src, int64_t src_stride, int64_t tile_rows, int64_t size);
+
+void forward_flash_attn_ext_f16_one_chunk_vlen1024_vf16(const ggml_compute_params * params,
+                                                        ggml_tensor *               dst,
+                                                        int                         ir0,
+                                                        int                         ir1,
+                                                        void *                      tcm_buffer,
+                                                        size_t                      tcm_buffer_size);
+
+void forward_flash_attn_ext_f16_tiled_vlen1024_vf16(const ggml_compute_params * params,
+                                                    ggml_tensor *               dst,
+                                                    int                         ir0,
+                                                    int                         ir1,
+                                                    void *                      tcm_buffer,
+                                                    size_t                      tcm_buffer_size);
+
+void forward_rms_norm_f32(ggml_compute_params * params, ggml_tensor * op);
+
+void forward_norm_f32(ggml_compute_params * params, ggml_tensor * op);
+
+void forward_cont_with_permute(ggml_compute_params * params, ggml_tensor * op);
+
+void forward_cpy_with_permute(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_get_rows(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_concat(ggml_compute_params * params, ggml_tensor * op);
+
+template <ggml_op op_type, typename T> void forward_binary(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_sum_rows(const ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_repeat_nrows(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_repeat_dim1(ggml_compute_params * params, ggml_tensor * op);
+
+void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_4row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_4row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+}  // namespace rvv
+
+}  // namespace spacemit_kernels
--- a/ggml/src/ggml-cpu/spacemit/spine_barrier.h
+++ b/ggml/src/ggml-cpu/spacemit/spine_barrier.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+#define SPINE_CACHE_LINE  64
+#define SPINE_CACHE_ALIGN __attribute__((aligned(SPINE_CACHE_LINE)))
+
+// State of a reusable spin barrier. Every field is aligned to SPINE_CACHE_LINE (64) bytes.
+struct spine_barrier_t {
+    SPINE_CACHE_ALIGN std::atomic<int64_t> pending_;  // arrivals still outstanding in the current round
+    SPINE_CACHE_ALIGN std::atomic<int64_t> rounds_;   // round counter; waiters spin until it changes
+    SPINE_CACHE_ALIGN int64_t              total_;    // participant count, reloaded into pending_ each round
+};
+
+// Blocks the caller until total_ threads have arrived at barrier `b`.
+//
+// Every arrival decrements pending_. The last arrival re-arms pending_ for the next round
+// and then advances rounds_, which releases the threads spinning on it.
+inline void spine_barrier_wait(spine_barrier_t * b) {
+    auto cur_round = b->rounds_.load(std::memory_order_acquire);
+    auto cnt       = --b->pending_;
+    if (cnt == 0) {
+        // Re-arm before publishing the new round so released threads may re-enter at once.
+        b->pending_.store(b->total_);
+        b->rounds_.store(cur_round + 1);
+    } else {
+        // This load must be an acquire: it pairs with the store to rounds_ above (seq_cst,
+        // hence a release) so that everything written before the barrier is visible to the
+        // released thread. A relaxed load only guarantees that the counter itself is seen.
+        while (cur_round == b->rounds_.load(std::memory_order_acquire)) {
+            __asm__ volatile("pause " ::: "memory");
+        }
+    }
+}
+
+// Sets up `num_barriers` consecutive barriers starting at `b` for `thread_count`
+// participants each and resets their round counters. total_ is a plain, non-atomic write.
+inline void spine_barrier_init(spine_barrier_t * b, int num_barriers, uint64_t thread_count) {
+    int idx = 0;
+    while (idx < num_barriers) {
+        spine_barrier_t & slot = b[idx];
+        slot.total_            = thread_count;
+        slot.pending_.store(thread_count);
+        slot.rounds_.store(0);
+        ++idx;
+    }
+}
--- a/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp
+++ b/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp
@@ -0,0 +1,760 @@
+#include "spine_mem_pool.h"
+
+#include "common.h"
+#include "ime_env.h"
+#include "spine_tcm.h"
+
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+namespace ggml::cpu::riscv64_spacemit {
+namespace {
+
+constexpr size_t   SPINE_MEM_POOL_CHUNK_SIZE         = 512ull * 1024ull * 1024ull;
+constexpr size_t   SPINE_SHARE_MEM_POOL_CHUNK_SIZE   = 512ull * 1024ull;
+constexpr size_t   SPINE_MEM_POOL_1G_REGION_SIZE     = 1ull << 30;
+constexpr uint64_t HUGETLB_1G_FLAG_REQUIRE_PUD       = 1ull << 0;
+constexpr char     SPINE_MEM_POOL_HUGETLB_1G_DEV[]   = "/dev/hugetlb_1g";
+constexpr char     SPINE_MEM_POOL_TCM_SYNC_MEM_DEV[] = "/dev/tcm_sync_mem";
+
+// Argument structure of the HUGETLB_1G_IOC_ALLOC ioctl (declared _IOWR, i.e. exchanged in
+// both directions with the driver).
+// NOTE(review): the field meanings below are inferred from their names — confirm against
+// the /dev/hugetlb_1g driver interface.
+struct hugetlb_1g_region {
+    uint64_t size{ 0 };      // presumably the region size in bytes
+    uint64_t dma_addr{ 0 };  // presumably filled in by the driver
+    uint64_t flags{ 0 };     // presumably HUGETLB_1G_FLAG_* bits
+    uint64_t reserved{ 0 };
+};
+
+#define HUGETLB_1G_IOC_MAGIC 'M'
+#define HUGETLB_1G_IOC_ALLOC _IOWR(HUGETLB_1G_IOC_MAGIC, 0x00, struct hugetlb_1g_region)
+#define HUGETLB_1G_IOC_FREE  _IO(HUGETLB_1G_IOC_MAGIC, 0x01)
+
+// One unused byte range inside a pool_chunk.
+struct free_block {
+    size_t offset{ 0 };  // start of the range, relative to the chunk base
+    size_t size{ 0 };    // length of the range in bytes
+};
+
+// One backing memory region owned by a pool manager, plus its free list.
+struct pool_chunk {
+    uint8_t *               base{ nullptr };
+    size_t                  size{ 0 };
+    int                     fd{ -1 };     // -1 when the backend has no file descriptor for the chunk
+    std::vector<free_block> free_blocks;  // kept sorted by offset, adjacent ranges merged
+};
+
+// Bookkeeping record for one live allocation handed out by a pool manager.
+struct pool_allocation {
+    void * chunk_base{ nullptr };  // base of the owning chunk (with chunk_size, identifies the chunk)
+    size_t chunk_size{ 0 };
+    void * base{ nullptr };        // address returned to the caller
+    size_t size{ 0 };              // bytes reserved, after rounding up to the alignment
+};
+
+// True when exactly one bit of `value` is set; zero is rejected.
+bool is_power_of_two(size_t value) {
+    if (value == 0) {
+        return false;
+    }
+    // Clearing the lowest set bit leaves nothing behind only for powers of two.
+    const size_t without_lowest_bit = value & (value - 1);
+    return without_lowest_bit == 0;
+}
+
+// Rounds `value` up to the next multiple of `alignment` and stores it in *aligned_value.
+// Returns false, leaving *aligned_value untouched, when `aligned_value` is null,
+// `alignment` is zero, or the rounded value would not fit in size_t.
+bool align_up(size_t value, size_t alignment, size_t * aligned_value) {
+    if (alignment == 0 || aligned_value == nullptr) {
+        return false;
+    }
+
+    const size_t remainder = value % alignment;
+    const size_t padding   = remainder == 0 ? 0 : alignment - remainder;
+    if (padding > std::numeric_limits<size_t>::max() - value) {
+        return false;
+    }
+
+    *aligned_value = value + padding;
+    return true;
+}
+
+// Address flavour of align_up: rounds `value` up to the next multiple of `alignment` and
+// stores it in *aligned_value. Returns false, leaving *aligned_value untouched, when
+// `aligned_value` is null, `alignment` is zero, or the result would overflow uintptr_t.
+bool align_up_uintptr(uintptr_t value, size_t alignment, uintptr_t * aligned_value) {
+    if (alignment == 0 || aligned_value == nullptr) {
+        return false;
+    }
+
+    const uintptr_t remainder = value % alignment;
+    const uintptr_t padding   = remainder == 0 ? 0 : alignment - remainder;
+    if (padding > std::numeric_limits<uintptr_t>::max() - value) {
+        return false;
+    }
+
+    *aligned_value = value + padding;
+    return true;
+}
+
+class spine_mem_pool_manager {
+  public:
+    explicit spine_mem_pool_manager(size_t default_chunk_size) : default_chunk_size_(default_chunk_size) {}
+
+    virtual ~spine_mem_pool_manager() = default;
+
+    // Hands out `size` bytes aligned to `alignment` from the pool.
+    //
+    // Returns nullptr when size is zero, alignment is not a power of two, the size cannot
+    // be rounded up to the alignment, or no chunk can satisfy the request. Rethrows
+    // std::bad_alloc from the bookkeeping map after returning the range to the free list.
+    void * alloc(size_t size, size_t alignment) {
+        if (size == 0 || !is_power_of_two(alignment)) {
+            return nullptr;
+        }
+
+        // The reserved size is rounded up to a multiple of the alignment.
+        size_t aligned_size = 0;
+        if (!align_up(size, alignment, &aligned_size)) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: align_up failed for size %zu alignment %zu\n", __func__, size,
+                           alignment);
+            return nullptr;
+        }
+
+        pool_allocation allocation;
+
+        std::lock_guard<std::mutex> lock(mutex_);
+
+        // First try the existing chunks; if none fits, add one chunk and retry once.
+        if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
+            if (!add_chunk_locked(aligned_size, alignment)) {
+                return nullptr;
+            }
+
+            if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation retry failed for size %zu alignment %zu\n",
+                               __func__, aligned_size, alignment);
+                return nullptr;
+            }
+        }
+
+        // Record the allocation so free() can find its chunk and size; undo the
+        // reservation if the record cannot be stored.
+        try {
+            const auto [allocation_it, inserted] = allocations_.emplace(allocation.base, allocation);
+            if (!inserted) {
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: duplicate allocation key %p\n", __func__, allocation.base);
+                rollback_allocation_locked(allocation);
+                return nullptr;
+            }
+        } catch (const std::bad_alloc &) {
+            rollback_allocation_locked(allocation);
+            throw;
+        }
+
+        return allocation.base;
+    }
+
+    // Returns an allocation obtained from alloc() to the pool. A null pointer is ignored;
+    // an address the pool does not know is logged and otherwise ignored. When the owning
+    // chunk becomes entirely free it is released to the backend.
+    void free(void * base) {
+        if (base == nullptr) {
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex_);
+
+        auto allocation_it = allocations_.find(base);
+        if (allocation_it == allocations_.end()) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown allocation %p\n", __func__, base);
+            return;
+        }
+
+        // Copy the record before erasing it: the iterator is invalid afterwards. Note
+        // that the record is dropped even if one of the sanity checks below fails.
+        pool_allocation allocation = allocation_it->second;
+        allocations_.erase(allocation_it);
+
+        auto chunk_it = find_chunk_locked(allocation);
+        if (chunk_it == chunks_.end()) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown chunk for allocation %p size %zu\n", __func__,
+                           allocation.base, allocation.size);
+            return;
+        }
+
+        // Sanity checks: the allocation must start inside the chunk ...
+        auto * chunk_base = chunk_it->base;
+        auto * alloc_base = static_cast<uint8_t *>(allocation.base);
+        if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p out of chunk range %p..%p\n", __func__,
+                           allocation.base, chunk_base, chunk_base + chunk_it->size);
+            return;
+        }
+
+        // ... and end inside it (written so that the comparison cannot overflow).
+        const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
+        if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p size %zu exceeds chunk size %zu\n", __func__,
+                           allocation.base, allocation.size, chunk_it->size);
+            return;
+        }
+
+        insert_free_block_locked(*chunk_it, { offset, allocation.size });
+        maybe_release_empty_chunk_locked(chunk_it);
+    }
+
+  protected:
+    // Forgets every live allocation and hands all chunks back to the backend. The derived
+    // classes visible in this file call it from their destructors.
+    void release_chunks() {
+        const std::lock_guard<std::mutex> guard(mutex_);
+
+        allocations_.clear();
+        for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end(); ++chunk_it) {
+            dealloc_chunk(&*chunk_it);
+        }
+        chunks_.clear();
+    }
+
+    // Chunk size passed to the constructor; 0 makes add_chunk_locked() size each chunk to the request.
+    size_t default_chunk_size() const { return default_chunk_size_; }
+
+    // Resets a chunk descriptor to its empty state (no memory, no descriptor, no free
+    // list). It does not release anything; backends call it after freeing the memory.
+    static void clear_chunk(pool_chunk * chunk) {
+        chunk->free_blocks.clear();
+        chunk->fd   = -1;
+        chunk->size = 0;
+        chunk->base = nullptr;
+    }
+
+    virtual bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) = 0;
+    virtual void dealloc_chunk(pool_chunk * chunk)                                                    = 0;
+
+  private:
+    // Best placement found so far while try_alloc_locked() scans the free lists.
+    struct alloc_candidate {
+        size_t    chunk_index{ 0 };     // index into chunks_
+        size_t    block_index{ 0 };     // index into that chunk's free_blocks
+        size_t    aligned_offset{ 0 };  // chunk-relative offset of the aligned start
+        uintptr_t address{ std::numeric_limits<uintptr_t>::max() };  // absolute aligned address
+        bool      valid{ false };       // false until a fitting block has been seen
+    };
+
+    // Locates the chunk an allocation was carved from by matching both base address and
+    // size. Returns chunks_.end() when no chunk matches. Caller must hold mutex_.
+    std::vector<pool_chunk>::iterator find_chunk_locked(const pool_allocation & allocation) {
+        auto chunk_it = chunks_.begin();
+        for (; chunk_it != chunks_.end(); ++chunk_it) {
+            const bool same_base = chunk_it->base == allocation.chunk_base;
+            if (same_base && chunk_it->size == allocation.chunk_size) {
+                break;
+            }
+        }
+        return chunk_it;
+    }
+
+    // Asks the backend for a new chunk of at least `min_size` bytes and registers it with
+    // a single free block spanning the whole chunk. Returns false when the backend fails
+    // or returns an unusable chunk. Caller must hold mutex_.
+    bool add_chunk_locked(size_t min_size, size_t alignment) {
+        pool_chunk   chunk;
+        // A default chunk size of 0 means "exactly as large as requested".
+        const size_t chunk_request = default_chunk_size_ == 0 ? min_size : std::max(min_size, default_chunk_size_);
+        void *       hint_addr     = nullptr;
+
+        // Hint the backend towards the highest end address among the existing chunks.
+        for (const auto & existing_chunk : chunks_) {
+            auto * chunk_end = existing_chunk.base + existing_chunk.size;
+            if (hint_addr == nullptr || chunk_end > hint_addr) {
+                hint_addr = chunk_end;
+            }
+        }
+
+        if (!alloc_chunk(chunk_request, alignment, hint_addr, &chunk)) {
+            return false;
+        }
+
+        if (chunk.base == nullptr || chunk.size < min_size) {
+            GGML_LOG_ERROR(
+                "CPU_RISCV64_SPACEMIT: %s: invalid chunk returned for request size %zu, chunk_base=%p chunk_size=%zu\n",
+                __func__, min_size, chunk.base, chunk.size);
+            dealloc_chunk(&chunk);
+            return false;
+        }
+
+        // Give the memory back to the backend if registering the chunk runs out of memory.
+        try {
+            chunk.free_blocks.push_back({ 0, chunk.size });
+            chunks_.push_back(std::move(chunk));
+        } catch (const std::bad_alloc &) {
+            dealloc_chunk(&chunk);
+            throw;
+        }
+
+        return true;
+    }
+
+    // Undoes a reservation made by try_alloc_locked() that could not be recorded in
+    // allocations_: the range goes back to its chunk's free list and the chunk is released
+    // if that leaves it entirely free. Inconsistencies are logged and the rollback is
+    // abandoned. Caller must hold mutex_.
+    void rollback_allocation_locked(const pool_allocation & allocation) {
+        auto chunk_it = find_chunk_locked(allocation);
+        if (chunk_it == chunks_.end()) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, owning chunk not found\n",
+                           __func__, allocation.base);
+            return;
+        }
+
+        // The allocation must start inside the chunk ...
+        auto * chunk_base = chunk_it->base;
+        auto * alloc_base = static_cast<uint8_t *>(allocation.base);
+        if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, chunk range is invalid\n",
+                           __func__, allocation.base);
+            return;
+        }
+
+        // ... and end inside it (written so that the comparison cannot overflow).
+        const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
+        if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p size %zu\n", __func__,
+                           allocation.base, allocation.size);
+            return;
+        }
+
+        insert_free_block_locked(*chunk_it, { offset, allocation.size });
+        maybe_release_empty_chunk_locked(chunk_it);
+    }
+
+    // Tries to carve `size` bytes aligned to `alignment` out of the existing chunks.
+    //
+    // Every free block of every chunk is examined and the fitting placement with the
+    // lowest aligned address wins. On success the chosen block is split into optional
+    // leading padding, the allocation, and an optional tail; *allocation is filled in and
+    // true is returned. Returns false when nothing fits. Caller must hold mutex_.
+    bool try_alloc_locked(size_t size, size_t alignment, pool_allocation * allocation) {
+        alloc_candidate best;
+
+        for (size_t chunk_index = 0; chunk_index < chunks_.size(); ++chunk_index) {
+            const auto & chunk = chunks_[chunk_index];
+            for (size_t block_index = 0; block_index < chunk.free_blocks.size(); ++block_index) {
+                const auto & block = chunk.free_blocks[block_index];
+
+                // Alignment applies to the absolute address, not to the chunk offset.
+                uintptr_t  aligned_addr = 0;
+                const auto block_addr   = reinterpret_cast<uintptr_t>(chunk.base + block.offset);
+                if (!align_up_uintptr(block_addr, alignment, &aligned_addr)) {
+                    continue;
+                }
+
+                if (aligned_addr < block_addr) {
+                    continue;
+                }
+
+                // Skip blocks that cannot hold the padding plus the request (overflow-safe form).
+                const size_t aligned_offset = block.offset + static_cast<size_t>(aligned_addr - block_addr);
+                const size_t padding        = aligned_offset - block.offset;
+                if (padding > block.size || size > block.size - padding) {
+                    continue;
+                }
+
+                if (!best.valid || aligned_addr < best.address) {
+                    best.chunk_index    = chunk_index;
+                    best.block_index    = block_index;
+                    best.aligned_offset = aligned_offset;
+                    best.address        = aligned_addr;
+                    best.valid          = true;
+                }
+            }
+        }
+
+        if (!best.valid) {
+            return false;
+        }
+
+        // Copy the block by value: the erase below invalidates references into the vector.
+        auto &           chunk     = chunks_[best.chunk_index];
+        const free_block block     = chunk.free_blocks[best.block_index];
+        const size_t     padding   = best.aligned_offset - block.offset;
+        const size_t     alloc_end = best.aligned_offset + size;
+        const size_t     block_end = block.offset + block.size;
+
+        // Replace the block with its unused head and tail, keeping the list sorted by offset.
+        chunk.free_blocks.erase(chunk.free_blocks.begin() + best.block_index);
+        auto insert_it = chunk.free_blocks.begin() + best.block_index;
+        if (padding != 0) {
+            insert_it = chunk.free_blocks.insert(insert_it, { block.offset, padding });
+            ++insert_it;
+        }
+        if (alloc_end < block_end) {
+            chunk.free_blocks.insert(insert_it, { alloc_end, block_end - alloc_end });
+        }
+
+        allocation->chunk_base = chunk.base;
+        allocation->chunk_size = chunk.size;
+        allocation->base       = chunk.base + best.aligned_offset;
+        allocation->size       = size;
+        return true;
+    }
+
+    // Releases the chunk to the backend and drops it from chunks_ when its free list is a
+    // single block spanning the whole chunk, i.e. nothing in it is allocated any more.
+    // Caller must hold mutex_; chunk_it is invalid after a release.
+    void maybe_release_empty_chunk_locked(std::vector<pool_chunk>::iterator chunk_it) {
+        const auto & free_list  = chunk_it->free_blocks;
+        const bool   fully_free =
+            free_list.size() == 1 && free_list.front().offset == 0 && free_list.front().size == chunk_it->size;
+        if (!fully_free) {
+            return;
+        }
+
+        dealloc_chunk(&*chunk_it);
+        chunks_.erase(chunk_it);
+    }
+
+    // Adds `block` to the chunk's free list, which is kept sorted by offset, and merges it
+    // with the neighbouring blocks it touches. A block overlapping an existing free block
+    // is logged and not inserted. Caller must hold mutex_.
+    void insert_free_block_locked(pool_chunk & chunk, free_block block) {
+        // Find the first free block that does not start before the new one.
+        auto it = chunk.free_blocks.begin();
+        while (it != chunk.free_blocks.end() && it->offset < block.offset) {
+            ++it;
+        }
+
+        if (it != chunk.free_blocks.begin()) {
+            const auto & prev = *(it - 1);
+            if (prev.offset + prev.size > block.offset) {
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping free block at offset %zu size %zu\n", __func__,
+                               block.offset, block.size);
+                return;
+            }
+        }
+
+        if (it != chunk.free_blocks.end() && block.offset + block.size > it->offset) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping next free block at offset %zu size %zu\n", __func__,
+                           block.offset, block.size);
+            return;
+        }
+
+        it = chunk.free_blocks.insert(it, block);
+
+        // Merge with the predecessor: grow the new block backwards, then drop the
+        // predecessor. erase() returns the element that followed it, i.e. the merged block.
+        if (it != chunk.free_blocks.begin()) {
+            auto prev = it - 1;
+            if (prev->offset + prev->size == it->offset) {
+                it->offset = prev->offset;
+                it->size += prev->size;
+                it = chunk.free_blocks.erase(prev);
+            }
+        }
+
+        // Merge with the successor when it starts exactly where this block ends.
+        if (it + 1 != chunk.free_blocks.end() && it->offset + it->size == (it + 1)->offset) {
+            it->size += (it + 1)->size;
+            chunk.free_blocks.erase(it + 1);
+        }
+    }
+
+    std::mutex                                  mutex_;
+    std::vector<pool_chunk>                     chunks_;
+    std::unordered_map<void *, pool_allocation> allocations_;
+    size_t                                      default_chunk_size_{ 0 };
+};
+
+class spine_mem_pool_posix final : public spine_mem_pool_manager {  // pool backend whose chunks come from posix_memalign()
+  public:
+    spine_mem_pool_posix() : spine_mem_pool_manager(0) {}  // 0 presumably means "no default chunk size" - confirm in base ctor
+
+    ~spine_mem_pool_posix() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) hint_addr;  // placement hint is not used by this backend
+
+        const size_t alloc_alignment = std::max(alignment, sizeof(void *));  // posix_memalign needs at least sizeof(void *)
+        void *       base            = nullptr;
+        const int    rc              = posix_memalign(&base, alloc_alignment, min_size);  // returns an error code, not errno
+        if (rc != 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: posix_memalign failed for size %zu alignment %zu, rc=%d\n",
+                           __func__, min_size, alloc_alignment, rc);
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(base);
+        chunk->size = min_size;  // chunk is exactly the requested size, no rounding
+        chunk->fd   = -1;        // no file descriptor backs this chunk
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        std::free(chunk->base);  // posix_memalign memory is released with free()
+        clear_chunk(chunk);
+    }
+};
+
+class spine_mem_pool_transparent_hugepage final : public spine_mem_pool_manager {  // anonymous mmap + MADV_HUGEPAGE backend
+  public:
+    spine_mem_pool_transparent_hugepage() : spine_mem_pool_manager(SPINE_MEM_POOL_CHUNK_SIZE) {}
+
+    ~spine_mem_pool_transparent_hugepage() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) alignment;  // not used here; mmap returns page-aligned memory
+
+        size_t chunk_size = 0;
+        if (!align_up(min_size, default_chunk_size(), &chunk_size)) {  // presumably rounds up to a chunk-size multiple
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round chunk size for %zu\n", __func__, min_size);
+            return false;
+        }
+
+        void * map_addr = mmap(hint_addr, chunk_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);  // no MAP_FIXED: hint only
+        if (map_addr == MAP_FAILED) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for chunk size %zu, errno=%d\n", __func__, chunk_size,
+                           errno);
+            return false;
+        }
+
+        if (madvise(map_addr, chunk_size, MADV_HUGEPAGE) != 0) {  // NOTE(review): a failed hint fails the whole allocation - intended?
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: madvise(MADV_HUGEPAGE) failed for chunk size %zu, errno=%d\n",
+                           __func__, chunk_size, errno);
+            munmap(map_addr, chunk_size);  // do not leak the mapping on the failure path
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(map_addr);
+        chunk->size = chunk_size;  // rounded size, needed again for munmap
+        chunk->fd   = -1;          // anonymous mapping, no descriptor
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {  // munmap only runs for a non-empty chunk
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for chunk %p size %zu, errno=%d\n", __func__,
+                           chunk->base, chunk->size, errno);
+        }
+
+        clear_chunk(chunk);  // runs even when munmap reported an error
+    }
+};
+
+class spine_mem_pool_hugetlb_1g final : public spine_mem_pool_manager {  // backend using the hugetlb_1g device (ioctl + mmap)
+  public:
+    spine_mem_pool_hugetlb_1g() : spine_mem_pool_manager(SPINE_MEM_POOL_1G_REGION_SIZE) {}
+
+    ~spine_mem_pool_hugetlb_1g() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) alignment;  // not used by this backend
+        (void) hint_addr;  // mapping address is always chosen by the kernel (mmap(nullptr, ...))
+
+        size_t region_size = 0;
+        if (!align_up(min_size, SPINE_MEM_POOL_1G_REGION_SIZE, &region_size)) {  // presumably rounds up to a region multiple
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round hugetlb_1g size for %zu\n", __func__, min_size);
+            return false;
+        }
+
+        const int fd = open(SPINE_MEM_POOL_HUGETLB_1G_DEV, O_RDWR);  // one descriptor per chunk, kept until dealloc_chunk
+        if (fd < 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
+                           SPINE_MEM_POOL_HUGETLB_1G_DEV, errno);
+            return false;
+        }
+
+        hugetlb_1g_region region{};  // value-initialise so members not set below are zero, not indeterminate, in the ioctl
+        region.size  = region_size;
+        region.flags = HUGETLB_1G_FLAG_REQUIRE_PUD;
+        if (ioctl(fd, HUGETLB_1G_IOC_ALLOC, &region) < 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_ALLOC failed for size %zu, errno=%d\n", __func__,
+                           region_size, errno);
+            close(fd);
+            return false;
+        }
+
+        void * map_addr = mmap(nullptr, region.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);  // region.size as left by the ioctl
+        if (map_addr == MAP_FAILED) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for hugetlb_1g size %llu, errno=%d\n", __func__,
+                           static_cast<unsigned long long>(region.size), errno);
+            ioctl(fd, HUGETLB_1G_IOC_FREE);  // undo the successful IOC_ALLOC before giving up
+            close(fd);
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(map_addr);
+        chunk->size = region.size;
+        chunk->fd   = fd;  // dealloc_chunk needs it for IOC_FREE and close
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {  // unmap before freeing the region
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for hugetlb_1g chunk %p size %zu, errno=%d\n",
+                           __func__, chunk->base, chunk->size, errno);
+        }
+
+        if (chunk->fd >= 0) {
+            if (ioctl(chunk->fd, HUGETLB_1G_IOC_FREE) < 0) {  // failure is logged only; the descriptor is still closed
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_FREE failed for chunk %p, errno=%d\n",
+                               __func__, chunk->base, errno);
+            }
+
+            close(chunk->fd);
+        }
+
+        clear_chunk(chunk);
+    }
+};
+
+class spine_mem_pool_shared_mem final : public spine_mem_pool_manager {  // backend mapping the TCM sync-mem device
+  public:
+    spine_mem_pool_shared_mem() : spine_mem_pool_manager(SPINE_SHARE_MEM_POOL_CHUNK_SIZE) {}
+
+    ~spine_mem_pool_shared_mem() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) alignment;  // not used by this backend
+
+        if (hint_addr != nullptr) {  // presumably non-null only when a chunk already exists - confirm against the caller
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem does not support multiple active chunks\n", __func__);
+            return false;
+        }
+
+        if (min_size > default_chunk_size()) {  // the mapping below is fixed-size, so larger requests cannot be served
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem request %zu exceeds chunk size %zu\n", __func__,
+                           min_size, default_chunk_size());
+            return false;
+        }
+
+        const int fd = open(SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, O_RDWR | O_SYNC);
+        if (fd < 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
+                           SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, errno);
+            return false;
+        }
+
+        void * map_addr = mmap(nullptr, default_chunk_size(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);  // always the full chunk size
+        if (map_addr == MAP_FAILED) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for %s size %zu, errno=%d\n", __func__,
+                           SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, default_chunk_size(), errno);
+            close(fd);  // do not leak the descriptor on the failure path
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(map_addr);
+        chunk->size = default_chunk_size();
+        chunk->fd   = fd;  // kept open for the chunk's lifetime, closed in dealloc_chunk
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {  // failure is logged, cleanup continues
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for shared_mem chunk %p size %zu, errno=%d\n",
+                           __func__, chunk->base, chunk->size, errno);
+        }
+
+        if (chunk->fd >= 0) {
+            close(chunk->fd);
+        }
+
+        clear_chunk(chunk);
+    }
+};
+
+spine_mem_pool_manager & get_spine_mem_pool_manager() {  // returns the single pool; backend is fixed by the first call
+    static std::once_flag                          pool_once;
+    static std::unique_ptr<spine_mem_pool_manager> selected_pool;
+    static spine_mem_pool_backend                  selected_backend = spine_mem_pool_backend::none;
+
+    spine_mem_pool_backend backend = global_spine_env_info.mem_backend;  // re-read on every call
+    if (backend == spine_mem_pool_backend::none) {
+        backend = spine_mem_pool_backend::transparent_hugepage;  // default when nothing was requested
+    }
+
+    std::call_once(pool_once, [&]() {  // body runs once; later calls keep the first selection
+        selected_backend = backend;
+
+        switch (selected_backend) {
+            case spine_mem_pool_backend::posix_memalign:
+                selected_pool = std::make_unique<spine_mem_pool_posix>();
+                break;
+            case spine_mem_pool_backend::transparent_hugepage:
+                selected_pool = std::make_unique<spine_mem_pool_transparent_hugepage>();
+                break;
+            case spine_mem_pool_backend::hugetlb_1g:
+                selected_pool = std::make_unique<spine_mem_pool_hugetlb_1g>();
+                break;
+            case spine_mem_pool_backend::none:  // not reachable: `backend` was normalised above; presumably kept for exhaustiveness
+                selected_backend = spine_mem_pool_backend::transparent_hugepage;
+                selected_pool    = std::make_unique<spine_mem_pool_transparent_hugepage>();
+                break;
+        }
+    });
+
+    if (backend != selected_backend) {  // request differs from the first selection: logged on every such call, pool unchanged
+        GGML_LOG_ERROR(
+            "CPU_RISCV64_SPACEMIT: %s: mem pool backend is process-global and mutually exclusive, requested=%d but "
+            "selected=%d\n",
+            __func__, static_cast<int>(backend), static_cast<int>(selected_backend));
+    }
+
+    if (selected_pool) {
+        return *selected_pool;
+    }
+
+    throw std::bad_alloc();  // no pool was created, e.g. an enum value outside the switch cases
+}
+
+// Returns the shared-memory pool, constructing it the first time this is called.
+spine_mem_pool_manager & get_spine_mem_pool_shared_mem_manager() {
+    static std::once_flag                             init_flag;
+    static std::unique_ptr<spine_mem_pool_shared_mem> instance;
+
+    std::call_once(init_flag, [&]() { instance = std::make_unique<spine_mem_pool_shared_mem>(); });
+
+    if (!instance) {
+        throw std::bad_alloc();
+    }
+    return *instance;
+}
+
+}  // namespace
+
+bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept {  // binds the TCM runtime and fills *info; false on any failure
+    if (info == nullptr) {
+        return false;
+    }
+
+    *info = {};  // reset first so every failure path leaves the struct at its defaults (available == false)
+
+    if (spine_tcm_open_handle(NULL) != 0 || !spine_tcm_is_available()) {  // NULL selects the loader's default library name
+        return false;
+    }
+
+    spine_tcm_mem_info_t mem_info;
+    if (spine_tcm_mem_info(&mem_info) != 0) {
+        return false;
+    }
+
+    info->available   = true;
+    info->blk_size    = mem_info.blk_size;
+    info->blk_num     = mem_info.blk_num;
+    info->is_fake_tcm = mem_info.is_fake_tcm != 0;  // int flag from the C API converted to bool
+    return true;
+}
+
+void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept {
+    return spine_tcm_mem_get(cpu_id);
+}
+
+void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept {  // cpu_id is forwarded as the TCM id
+    return spine_tcm_mem_try_wait(cpu_id, 1000 * 1000);  // 1 s if the unit is microseconds, as the runtime's `timeout_us` name suggests
+}
+
+int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept {
+    return spine_tcm_mem_release(cpu_id);
+}
+
+void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept {  // allocate from the main pool; nullptr on bad_alloc
+    try {
+        return get_spine_mem_pool_manager().alloc(size, alignment);
+    } catch (const std::bad_alloc &) {  // only bad_alloc is handled; any other exception leaving a noexcept function calls std::terminate
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating size %zu\n", __func__, size);
+        return nullptr;
+    }
+}
+
+void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept {
+    try {
+        return get_spine_mem_pool_shared_mem_manager().alloc(size, alignment);
+    } catch (const std::bad_alloc &) {
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating shared memory size %zu\n", __func__, size);
+        return nullptr;
+    }
+}
+
+void spine_mem_pool_free(void * base) noexcept {  // hand `base` back to the main pool
+    try {
+        get_spine_mem_pool_manager().free(base);  // the getter throws bad_alloc when no pool exists
+    } catch (const std::bad_alloc &) {  // logged only; the allocation is then not released
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing allocation %p\n", __func__, base);
+    }
+}
+
+void spine_mem_pool_shared_mem_free(void * base) noexcept {
+    try {
+        get_spine_mem_pool_shared_mem_manager().free(base);
+    } catch (const std::bad_alloc &) {
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing shared allocation %p\n", __func__, base);
+    }
+}
+
+}  // namespace ggml::cpu::riscv64_spacemit
+
+extern "C" {
+void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment) {  // C entry point for the shared-memory pool
+    void * result = ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_alloc(size, alignment);
+    if (result == nullptr) {  // failure is logged here and nullptr is still returned to the caller
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to allocate shared memory size %zu alignment %zu\n", __func__,
+                       size, alignment);
+    }
+    return result;
+}
+
+void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr) {
+    ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_free(ptr);
+}
+}
--- a/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h
+++ b/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace ggml::cpu::riscv64_spacemit {
+
+enum class spine_mem_pool_backend : uint8_t {  // selects the chunk source used by the main memory pool
+    none,                  // no explicit choice; get_spine_mem_pool_manager() treats it as transparent_hugepage
+    posix_memalign,        // chunks from posix_memalign()
+    transparent_hugepage,  // anonymous mmap plus madvise(MADV_HUGEPAGE)
+    hugetlb_1g,            // regions from the hugetlb_1g device (ioctl + mmap)
+};
+
+struct spine_mem_pool_tcm_info {
+    bool   available{ false };
+    size_t blk_size{ 0 };
+    size_t blk_num{ 0 };
+    bool   is_fake_tcm{ false };
+};
+
+bool   spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept;
+void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept;
+void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept;
+int    spine_mem_pool_tcm_mem_release(int cpu_id) noexcept;
+
+void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept;
+void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept;
+void   spine_mem_pool_free(void * base) noexcept;
+void   spine_mem_pool_shared_mem_free(void * base) noexcept;
+
+}  // namespace ggml::cpu::riscv64_spacemit
--- a/ggml/src/ggml-cpu/spacemit/spine_tcm.h
+++ b/ggml/src/ggml-cpu/spacemit/spine_tcm.h
@@ -0,0 +1,409 @@
+#ifndef SPINE_TCM_PUBLIC_H_
+#define SPINE_TCM_PUBLIC_H_
+
+/*
+ * spine_tcm public API
+ *
+ * Usage:
+ *   1. Direct link mode
+ *      Define SPINE_TCM_DIRECT_LINK and link against libspine_tcm.so.
+ *
+ *      if (spine_tcm_is_available()) {
+ *          void *buffer = spine_tcm_mem_get(0);
+ *          spine_tcm_mem_free(0);
+ *      }
+ *
+ *   2. Header-only loader mode
+ *      Include this header without linking libspine_tcm.so. The loader first
+ *      tries to reuse a process-global spine_tcm instance and falls back to
+ *      dlopen("libspine_tcm.so") when needed.
+ *
+ *      spine_tcm_open_handle(NULL);  // optional pre-bind
+ *      if (spine_tcm_is_available()) {
+ *          void *buffer = spine_tcm_mem_get(0);
+ *          spine_tcm_mem_free(0);
+ *      }
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !defined(SPINE_TCM_BUILD_SHARED) && !defined(SPINE_TCM_DIRECT_LINK)
+#    include <dlfcn.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+#    if defined(SPINE_TCM_BUILD_SHARED)
+#        define SPINE_TCM_API __declspec(dllexport)
+#    else
+#        define SPINE_TCM_API __declspec(dllimport)
+#    endif
+#else
+#    define SPINE_TCM_API __attribute__((visibility("default")))
+#endif
+
+typedef struct spine_tcm_mem_info {
+    size_t blk_size;
+    size_t blk_num;
+    int    is_fake_tcm;
+} spine_tcm_mem_info_t;
+
+typedef struct spine_tcm_block_info {
+    int      id;
+    void *   va;
+    size_t   size;
+    uint64_t phys_addr;
+    uint64_t cpu_affinity_mask;
+    int      owner_tid;
+    int      is_acquired;
+} spine_tcm_block_info_t;
+
+/* Shared-library runtime ABI exported by libspine_tcm.so. */
+SPINE_TCM_API const char * spine_tcm_runtime_version(void);
+SPINE_TCM_API int          spine_tcm_runtime_is_available(void);
+SPINE_TCM_API int          spine_tcm_runtime_layout_info(spine_tcm_mem_info_t * info);
+SPINE_TCM_API int          spine_tcm_runtime_mem_info(int id, spine_tcm_block_info_t * info);
+SPINE_TCM_API void *       spine_tcm_runtime_mem_get(int id);
+SPINE_TCM_API int          spine_tcm_runtime_mem_free(int id);
+SPINE_TCM_API void *       spine_tcm_runtime_mem_try_wait(int id, size_t timeout_us);
+SPINE_TCM_API int          spine_tcm_runtime_mem_release(int id);
+SPINE_TCM_API int          spine_tcm_runtime_mem_force_release(int id);
+SPINE_TCM_API int          spine_tcm_runtime_mem_query(int id);
+
+#if defined(SPINE_TCM_DIRECT_LINK)
+/* Optional no-op in direct-link mode. */
+static inline int spine_tcm_open_handle(const char * so_path) {
+    (void) so_path;
+    return 0;
+}
+
+static inline const char * spine_tcm_version(void) {
+    return spine_tcm_runtime_version();
+}
+
+/* Returns 1 when the runtime driver is available, otherwise 0. */
+static inline int spine_tcm_is_available(void) {
+    return spine_tcm_runtime_is_available();
+}
+
+/* Returns runtime memory geometry and whether the current backend is fake TCM. */
+static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
+    return spine_tcm_runtime_layout_info(info);
+}
+
+/* Returns per-block runtime metadata for the given TCM id. */
+static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
+    return spine_tcm_runtime_mem_info(id, info);
+}
+
+/* Returns a cached buffer for the given TCM id, or NULL on failure. */
+static inline void * spine_tcm_mem_get(int id) {
+    return spine_tcm_runtime_mem_get(id);
+}
+
+/* Releases one reference acquired by spine_tcm_mem_get(id). */
+static inline int spine_tcm_mem_free(int id) {
+    return spine_tcm_runtime_mem_free(id);
+}
+
+/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
+static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
+    return spine_tcm_runtime_mem_try_wait(id, over_time);
+}
+
+/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
+static inline int spine_tcm_mem_release(int id) {
+    return spine_tcm_runtime_mem_release(id);
+}
+
+/* Forces a release for the given TCM id when the backend supports it. */
+static inline int spine_tcm_mem_force_release(int id) {
+    return spine_tcm_runtime_mem_force_release(id);
+}
+
+/* Returns whether the given TCM id is currently acquired. */
+static inline int spine_tcm_mem_query(int id) {
+    return spine_tcm_runtime_mem_query(id);
+}
+#elif !defined(SPINE_TCM_BUILD_SHARED)
+typedef struct spine_tcm_handle {
+    void * module_handle;
+    int    use_global_scope;
+    int    owns_module_handle;
+    const char * (*runtime_version)(void);
+    int (*runtime_is_available)(void);
+    int (*runtime_layout_info)(spine_tcm_mem_info_t * info);
+    int (*runtime_mem_info)(int id, spine_tcm_block_info_t * info);
+    void * (*runtime_mem_get)(int id);
+    int (*runtime_mem_free)(int id);
+    void * (*runtime_mem_try_wait)(int id, size_t over_time);
+    int (*runtime_mem_release)(int id);
+    int (*runtime_mem_force_release)(int id);
+    int (*runtime_mem_query)(int id);
+} spine_tcm_handle_t;
+
+static inline spine_tcm_handle_t * spine_tcm_default_handle(void) {  // static function: each translation unit gets its own handle
+    static spine_tcm_handle_t handle = { 0 };  // zero-initialised, i.e. "not bound yet"
+    return &handle;
+}
+
+static inline void spine_tcm_handle_reset(spine_tcm_handle_t * handle) {
+    if (handle != NULL) {
+        memset(handle, 0, sizeof(*handle));
+    }
+}
+
+static inline int spine_tcm_handle_bind(spine_tcm_handle_t * handle) {  // resolve all runtime entry points; 0 on success, -1 otherwise
+    void * symbol_scope = handle->use_global_scope ? RTLD_DEFAULT : handle->module_handle;  // global lookup or the dlopen'ed module
+
+    handle->runtime_version      = (const char * (*) (void) ) dlsym(symbol_scope, "spine_tcm_runtime_version");
+    handle->runtime_is_available = (int (*)(void)) dlsym(symbol_scope, "spine_tcm_runtime_is_available");
+    handle->runtime_layout_info =
+        (int (*)(spine_tcm_mem_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_layout_info");
+    handle->runtime_mem_info =
+        (int (*)(int, spine_tcm_block_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_mem_info");
+    handle->runtime_mem_get      = (void * (*) (int) ) dlsym(symbol_scope, "spine_tcm_runtime_mem_get");
+    handle->runtime_mem_free     = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_free");
+    handle->runtime_mem_try_wait = (void * (*) (int, size_t)) dlsym(symbol_scope, "spine_tcm_runtime_mem_try_wait");
+    handle->runtime_mem_release  = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_release");
+    handle->runtime_mem_force_release = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_force_release");
+    handle->runtime_mem_query         = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_query");
+
+    return handle->runtime_version != NULL && handle->runtime_is_available != NULL &&  /* all ten symbols must resolve */
+                   handle->runtime_layout_info != NULL && handle->runtime_mem_info != NULL &&
+                   handle->runtime_mem_get != NULL && handle->runtime_mem_free != NULL &&
+                   handle->runtime_mem_try_wait != NULL && handle->runtime_mem_release != NULL &&
+                   handle->runtime_mem_force_release != NULL && handle->runtime_mem_query != NULL ?
+               0 :
+               -1;  /* pointers found so far stay set; the caller resets the handle */
+}
+
+/*
+ * Try to bind against an already-loaded process-global spine_tcm instance.
+ * The shared library exports spine_tcm_runtime_marker only for this probe.
+ */
+static inline int spine_tcm_try_bind_global(spine_tcm_handle_t * handle) {
+    if (dlsym(RTLD_DEFAULT, "spine_tcm_runtime_marker") == NULL) {  /* marker missing: no global instance is loaded */
+        return -1;
+    }
+
+    handle->use_global_scope = 1;  /* stays set even if the bind below fails; the caller resets the handle */
+    return spine_tcm_handle_bind(handle);
+}
+
+/*
+ * Optional pre-bind entry point.
+ *
+ * Behavior:
+ *   - Reuses an already-loaded global spine_tcm instance when available.
+ *   - Otherwise loads the shared library from so_path or the default soname.
+ *   - Repeated calls are safe and return 0 after the first successful bind.
+ */
+static inline int spine_tcm_open_handle(const char * so_path) {  /* NOTE(review): no locking on the static handle - confirm first calls are serialised */
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+    const char *         library  = (so_path != NULL && so_path[0] != '\0') ? so_path : "libspine_tcm.so";  /* NULL or "" selects the default soname */
+
+    if (resolved->module_handle != NULL || resolved->use_global_scope) {  /* already bound: nothing to do */
+        return 0;
+    }
+
+    if (spine_tcm_try_bind_global(resolved) == 0) {  /* prefer an instance already loaded in the process */
+        return 0;
+    }
+
+    spine_tcm_handle_reset(resolved);  /* clear any partial state left by the failed global bind */
+
+    resolved->module_handle      = dlopen(library, RTLD_LAZY | RTLD_GLOBAL);
+    resolved->owns_module_handle = resolved->module_handle != NULL ? 1 : 0;
+
+    if (resolved->module_handle == NULL) {
+        spine_tcm_handle_reset(resolved);
+        return -1;
+    }
+
+    if (spine_tcm_handle_bind(resolved) != 0) {
+        if (resolved->owns_module_handle) {  /* always true here: set above from a non-NULL handle */
+            dlclose(resolved->module_handle);
+        }
+        spine_tcm_handle_reset(resolved);  /* back to "not bound", so a later call can retry */
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Returns 1 when the runtime driver is available, otherwise 0. */
+static inline int spine_tcm_is_available(void) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {  /* not bound yet: bind lazily */
+        (void) spine_tcm_open_handle(NULL);  /* result ignored; the state is re-checked below */
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_is_available == NULL) {  /* still unbound */
+        return 0;
+    }
+
+    return resolved->runtime_is_available();
+}
+
+/* Returns runtime memory geometry and whether the current backend is fake TCM. */
+static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_layout_info == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_layout_info(info);
+}
+
+static inline const char * spine_tcm_version(void) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_version == NULL) {
+        return "unknown";
+    }
+
+    return resolved->runtime_version();
+}
+
+/* Returns per-block runtime metadata for the given TCM id. */
+static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_info == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_info(id, info);
+}
+
+/* Returns a cached buffer for the given TCM id, or NULL on failure. */
+static inline void * spine_tcm_mem_get(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        return NULL;
+    }
+
+    if (resolved->runtime_mem_get == NULL) {
+        return NULL;
+    }
+
+    return resolved->runtime_mem_get(id);
+}
+
+/* Releases one reference acquired by spine_tcm_mem_get(id). */
+static inline int spine_tcm_mem_free(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_free == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_free(id);
+}
+
+/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
+static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        return NULL;
+    }
+
+    if (resolved->runtime_mem_try_wait == NULL) {
+        return NULL;
+    }
+
+    return resolved->runtime_mem_try_wait(id, over_time);
+}
+
+/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
+static inline int spine_tcm_mem_release(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_release == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_release(id);
+}
+
+/* Forces a release for the given TCM id when the backend supports it. */
+static inline int spine_tcm_mem_force_release(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) ||
+        resolved->runtime_mem_force_release == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_force_release(id);
+}
+
+/* Returns whether the given TCM id is currently acquired. */
+static inline int spine_tcm_mem_query(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_query == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_query(id);
+}
+#else
+static inline const char * spine_tcm_version(void) {
+    return spine_tcm_runtime_version();
+}
+#endif
+
+#define SPINE_TCM_VERSION (spine_tcm_version())
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/ggml/src/ggml-cuda/allreduce.cu
+++ b/ggml/src/ggml-cuda/allreduce.cu
@@ -184,13 +184,15 @@ static __global__ void ggml_cuda_ar_kernel(
            #pragma unroll
            for (int k = 0; k < ELEMS_PER_VEC; ++k) {
                const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
-                recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
+                recvbuf[off + k] = ggml_cuda_cast<T_dst>(
+                    ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(wire[k]));
            }
        }
        if (bid == 0 && tid < count - tail) {
            const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
-            recvbuf[tail + tid] =
-                ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
+            recvbuf[tail + tid] = ggml_cuda_cast<T_dst>(
+                ggml_cuda_cast<float>(d_low) +
+                ggml_cuda_cast<float>(host_other[tail + tid]));
        }
    }
 }
@@ -210,7 +212,8 @@ static __global__ void ggml_cuda_ar_add_kernel(
    const int nt  = gridDim.x * blockDim.x;
    for (int i = tid; i < count; i += nt) {
        const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
-        dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
+        dst[i] = ggml_cuda_cast<T_dst>(
+            ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(src[i]));
    }
 }

--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -125,61 +125,107 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
 }

 static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_rdna(const int DKQ, const int DV, const int ncols) {
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 128, 2,  64, 128, 128, 128, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64,  8, 128, 2,  64,  32,  32,  32, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 16, 128, 2,  64,  32,  32,  32, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 32, 128, 2,  64,  32,  32,  32, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 64, 128, 2,  64,  32,  32,  32, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2,  64, 160, 128,  64, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2,  64, 160, 128,  64, 2, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80,  8,  64, 2,  32,  40,  40,  40, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 16,  64, 2,  32,  40,  40,  40, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 32, 128, 2,  64,  40,  40,  40, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 64, 128, 2,  64,  40,  40,  40, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16,  64, 4,  32, 128, 128, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2,  32, 128, 128, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1,  32, 128, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96,  8,  64, 2,  32,  48,  48,  48, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 16,  64, 2,  32,  48,  48,  48, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 32, 128, 2,  64,  48,  48,  48, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 64, 128, 2,  64,  48,  48,  48, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32,  96,  64, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128, 128, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112,  8,  64, 2,  32,  56,  56,  56, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 16,  64, 2,  32,  56,  56,  56, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 32, 128, 2,  64,  56,  56,  56, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 64, 128, 2,  64,  56,  56,  56, 1, true);

-    // TODO tune specifically for RDNA
-    return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128,  8,  64, 2,  32,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 16,  64, 2,  32,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2,  64,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2,  64,  64,  64,  64, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128,  8,  64, 2,  32,  96,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16,  64, 2,  32,  96,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2,  64,  96,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2,  64,  96,  64,  64, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256,  8,  64, 2,  32, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16,  64, 2,  32, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2,  32, 160, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2,  32, 160, 128, 128, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8, 128, 3,  64,  96,  64, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 128, 3,  64,  96,  64, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2,  32, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 128, 2,  32, 128, 128, 128, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8, 128, 3,  64,  96,  64, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 128, 3,  64,  96,  64, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 128, 2,  32, 160, 128, 128, 1, true);
+
+    return fattn_mma_config(32, 1, 0, 0, 0, 0, 0, false);
 }

 static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_cdna(const int DKQ, const int DV, const int ncols) {
-    // Conservative configs for CDNA (MI100+): 64KB LDS, wavefront64, nstages=1 (no cp.async).
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64,  8, 128, 2, 128,  32,  32,  32, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 16, 128, 2,  64,  32,  32,  32, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 32, 128, 2,  64,  32,  32,  32, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 64, 256, 2,  64,  32,  32,  32, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64,  8, 128, 1,  64,  32,  32,  32, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 16, 256, 2,  64,  32,  32,  32, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 32, 256, 2,  64,  32,  32,  32, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 64, 256, 4,  64,  32,  32,  32, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80,  8, 128, 2, 128,  40,  40,  40, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 16, 128, 2,  64,  40,  40,  40, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 32, 128, 2,  64,  40,  40,  40, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80,  8, 256, 2,  64,  40,  40,  40, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 16, 256, 2,  64,  40,  40,  40, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 32, 256, 2,  64,  40,  40,  40, 1, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80, 64, 256, 2,  64,  40,  40,  40, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96,  8, 128, 2, 128,  48,  48,  48, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 16, 128, 2,  64,  48,  48,  48, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 32, 128, 2,  64,  48,  48,  48, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96,  8, 256, 2,  64,  48,  48,  48, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 16, 256, 2,  64,  48,  48,  48, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 32, 256, 2,  64,  48,  48,  48, 1, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96,  96, 64, 256, 2,  64,  48,  48,  48, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112,  8, 128, 2, 128,  56,  56,  56, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 16, 128, 2,  64,  56,  56,  56, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 32, 128, 2,  64,  56,  56,  56, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112,  8, 256, 2,  64,  56,  56,  56, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 16, 256, 2,  64,  56,  56,  56, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 32, 256, 2,  64,  56,  56,  56, 1, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 64, 256, 2,  64,  56,  56,  56, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128,  8, 128, 2, 128,  64,  64,  64, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 16, 128, 2,  64,  64,  64,  64, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2,  64,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128,  8, 256, 2,  64,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 16, 256, 2,  64,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 256, 2,  64,  64,  64,  64, 1, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 256, 2,  64,  64,  64,  64, 1, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256,  8,  64, 4,  64, 128, 128, 128, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16,  64, 4,  32, 128, 128, 128, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  32, 128, 128, 128, 1, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 256, 2,  32, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128,  8, 256, 1,  64,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16, 256, 1,  64,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 256, 1,  64,  64,  64,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 512, 1,  64,  64,  64,  64, 1, true);

-    // Fallback for unsupported DKQ values (e.g. 576). Must return non-zero values to satisfy
-    // compile-time static_asserts even though the kernel guard prevents runtime execution.
-    // nthreads=256 gives nwarps=4 (warp_size=64) or 8 (warp_size=32), nbatch_fa=128 satisfies np*16 divisibility.
-    return fattn_mma_config(256, 1, 128, 4, 4, 4, 1, false);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256,  8, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 512, 1,  64, 128, 128,  64, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 256, 1,  64, 160, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1,  64, 160, 128, 128, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512,  8, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1,  64, 128, 128, 128, 1, true);
+
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 256, 1,  64, 128, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 256, 1,  64, 160, 128, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  64, 160, 128, 128, 1, true);
+
+    return fattn_mma_config(32, 1, 0, 0, 0, 0, 0, false);
 }

 static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
@@ -510,7 +556,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        const int jt,
        const int kb0,
        const int k_VKQ_sup) {
-#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
+#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
    constexpr int  warp_size       = ggml_cuda_get_physical_warp_size();
    constexpr int  ncols           = ncols1 * ncols2;
    constexpr int  cols_per_warp   = T_B_KQ::I;
@@ -712,6 +758,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #pragma unroll
            for (int i00 = 0; i00 < nbatch_fa; i00 += np*T_C_KQ::J) {
                const int i0 = i00 + (threadIdx.y % np)*T_C_KQ::J;
+
+                // The mask is stored as 16-bit half values; loading them as 32-bit half2 values is preferred for speed.
+                // However, this is not possible for RDNA3, where 2 consecutive l indices are not consecutive in the mask memory layout.
+#ifdef RDNA3
+#pragma unroll
+                for (int l = 0; l < T_C_KQ::ne; ++l) {
+                    const int i = i0 + T_C_KQ::get_j(l);
+                    const int j = ((threadIdx.y / np)*cols_per_warp + T_C_KQ::get_i(l)) / ncols2;
+
+                    KQ_C[i00/(np*T_C_KQ::J)].x[l] += __half2float(tile_mask[j*(nbatch_fa + 8) + i]);
+                }
+#else
 #pragma unroll
                for (int l0 = 0; l0 < T_C_KQ::ne; l0 += 2) {
                    const int i = (i0 + T_C_KQ::get_j(l0)) / 2;
@@ -721,6 +779,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    KQ_C[i00/(np*T_C_KQ::J)].x[l0 + 0] += slope*tmp.x;
                    KQ_C[i00/(np*T_C_KQ::J)].x[l0 + 1] += slope*tmp.y;
                }
+#endif // RDNA3
            }
        }

@@ -827,13 +886,23 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
            }
        }
 #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
-        const half2 KQ_max_scale_h2 = make_half2(
-            KQ_max_scale[0], KQ_max_scale[0]);
+        if constexpr (std::is_same_v<decltype(T_C_VKQ::x), half2[T_C_VKQ::ne]>) {
+            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]);
 #pragma unroll
-        for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
+            for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
 #pragma unroll
-            for (int l = 0; l < T_C_VKQ::ne; ++l) {
-                VKQ_C[i].x[l] *= KQ_max_scale_h2;
+                for (int l = 0; l < T_C_VKQ::ne; ++l) {
+                    VKQ_C[i].x[l] *= KQ_max_scale_h2;
+                }
+            }
+        } else {
+            static_assert(std::is_same_v<decltype(T_C_VKQ::x), float[T_C_VKQ::ne]>, "bad VKQ type");
+#pragma unroll
+            for (int i = 0; i < DV/T_C_VKQ::J; ++i) {
+#pragma unroll
+                for (int l = 0; l < T_C_VKQ::ne; ++l) {
+                    VKQ_C[i].x[l] *= KQ_max_scale[0];
+                }
            }
        }
 #else // Volta
@@ -901,9 +970,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        const half2 * tile_V_i = !V_is_K_view || i0_stop > 2*nbatch_K2 ? tile_V : tile_V + i0_start/2;

 #if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
-        constexpr int i0_stride = cols_per_warp == 8 ? T_C_VKQ::I : 2*T_C_VKQ::J;
 #pragma unroll
-        for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += i0_stride) {
+        for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += T_A_VKQ::I) {
            static_assert((nbatch_fa/2) % (np*T_A_VKQ::J) == 0, "bad loop size");
 #pragma unroll
            for (int k00 = 0; k00 < nbatch_fa/2; k00 += np*T_A_VKQ::J) {
@@ -912,15 +980,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
                load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
                if constexpr (T_B_KQ::I == 8) {
-                    mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
+                    mma(VKQ_C[i_VKQ_0/T_A_VKQ::I], A, B[k00/(np*T_A_VKQ::J)]);
                } else {
                    // Wide version of VKQ_C is column-major.
 #if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
                    // AMD matrix C is column-major.
-                    mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
+                    mma(VKQ_C[i_VKQ_0/T_A_VKQ::I], A, B[k00/(np*T_A_VKQ::J)]);
 #else
                    // swap A and B for CUDA.
-                    mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::J)], A);
+                    mma(VKQ_C[i_VKQ_0/T_A_VKQ::I], B[k00/(np*T_A_VKQ::J)], A);
 #endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
                }
            }
@@ -953,11 +1021,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        tile_Q, tile_K, tile_V, tile_mask,
        Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
    NO_DEVICE_CODE;
-#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
+#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
 }

 #if defined(TURING_MMA_AVAILABLE)
-template<int ncols> struct mma_tile_sizes {
+template<int DV, int ncols> struct mma_tile_sizes {
    using T_A_KQ  = tile<16,  8, half2>; // row-major
    using T_B_KQ  = tile<16,  8, half2>; // column-major
    using T_C_KQ  = tile<16, 16, float>; // column-major
@@ -965,7 +1033,7 @@ template<int ncols> struct mma_tile_sizes {
    using T_B_VKQ = tile<16,  8, half2>; // column-major
    using T_C_VKQ = tile<16,  8, half2>; // column-major
 };
-template<> struct mma_tile_sizes<8> {
+template<int DV> struct mma_tile_sizes<DV, 8> {
    using T_A_KQ  = tile<16,  8, half2>; // row-major
    using T_B_KQ  = tile< 8,  8, half2>; // column-major
    using T_C_KQ  = tile<16,  8, float>; // row-major
@@ -973,8 +1041,60 @@ template<> struct mma_tile_sizes<8> {
    using T_B_VKQ = tile< 8,  8, half2>; // column-major
    using T_C_VKQ = tile<16,  4, half2>; // row-major
 };
-#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
-template<int ncols> struct mma_tile_sizes {
+#elif defined(AMD_WMMA_AVAILABLE)
+#ifdef RDNA3
+template<int DV, int ncols> struct mma_tile_sizes {
+    using T_A_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
+    using T_B_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major
+    using T_C_KQ  = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>;          // column-major
+    using T_A_VKQ = tile<32,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
+    using T_B_VKQ = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major
+    using T_C_VKQ = tile<16, 16, half2, DATA_LAYOUT_I_MAJOR>;          // column-major
+};
+template<int ncols> struct mma_tile_sizes<80, ncols> {
+    using T_A_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
+    using T_B_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major
+    using T_C_KQ  = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>;          // column-major
+    using T_A_VKQ = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
+    using T_B_VKQ = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major
+    using T_C_VKQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>;          // column-major
+};
+template<int ncols> struct mma_tile_sizes<112, ncols> {
+    using T_A_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
+    using T_B_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major
+    using T_C_KQ  = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>;          // column-major
+    using T_A_VKQ = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
+    using T_B_VKQ = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major
+    using T_C_VKQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>;          // column-major
+};
+#else
+template<int DV, int ncols> struct mma_tile_sizes {
+    using T_A_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR>;           // row-major
+    using T_B_KQ  = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR>;           // column-major
+    using T_C_KQ  = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>;           // column-major
+    using T_A_VKQ = tile<32,  8, half2, DATA_LAYOUT_I_MAJOR>;           // row-major
+    using T_B_VKQ = tile<16,  8, half2, DATA_LAYOUT_I_MAJOR>;           // column-major
+    using T_C_VKQ = tile<16, 16, half2, DATA_LAYOUT_I_MAJOR_SCRAMBLED>; // column-major
+};
+template<int ncols> struct mma_tile_sizes<80, ncols> {
+    using T_A_KQ  = tile<16,  8, half2>; // row-major
+    using T_B_KQ  = tile<16,  8, half2>; // column-major
+    using T_C_KQ  = tile<16, 16, float>; // column-major
+    using T_A_VKQ = tile<16,  8, half2>; // row-major
+    using T_B_VKQ = tile<16,  8, half2>; // column-major
+    using T_C_VKQ = tile<16,  8, half2>; // column-major
+};
+template<int ncols> struct mma_tile_sizes<112, ncols> {
+    using T_A_KQ  = tile<16,  8, half2>; // row-major
+    using T_B_KQ  = tile<16,  8, half2>; // column-major
+    using T_C_KQ  = tile<16, 16, float>; // column-major
+    using T_A_VKQ = tile<16,  8, half2>; // row-major
+    using T_B_VKQ = tile<16,  8, half2>; // column-major
+    using T_C_VKQ = tile<16,  8, half2>; // column-major
+};
+#endif // RDNA3
+#elif defined(AMD_MFMA_AVAILABLE)
+template<int DV, int ncols> struct mma_tile_sizes {
    using T_A_KQ  = tile<16,  8, half2>; // row-major
    using T_B_KQ  = tile<16,  8, half2>; // column-major
    using T_C_KQ  = tile<16, 16, float>; // column-major
@@ -983,7 +1103,7 @@ template<int ncols> struct mma_tile_sizes {
    using T_C_VKQ = tile<16,  8, half2>; // column-major
 };
 #else // Volta
-template<int ncols> struct mma_tile_sizes {
+template<int DV, int ncols> struct mma_tile_sizes {
    using T_A_KQ  = tile< 8,  4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
    using T_B_KQ  = tile<32,  4, half2, DATA_LAYOUT_I_MAJOR>;          // column-major
    using T_C_KQ  = tile<32,  8, float, DATA_LAYOUT_I_MAJOR>;          // column-major
@@ -1018,17 +1138,17 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const int zt_gqa,
        const int kb0_start,
        const int kb0_stop) {
-#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
+#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    constexpr int ncols = ncols1 * ncols2;
-    using     T_A_KQ    = typename mma_tile_sizes<ncols>::T_A_KQ;
-    using     T_B_KQ    = typename mma_tile_sizes<ncols>::T_B_KQ;
-    using     T_C_KQ    = typename mma_tile_sizes<ncols>::T_C_KQ;
-    using     T_A_VKQ   = typename mma_tile_sizes<ncols>::T_A_VKQ;
-    using     T_B_VKQ   = typename mma_tile_sizes<ncols>::T_B_VKQ;
-    using     T_C_VKQ   = typename mma_tile_sizes<ncols>::T_C_VKQ;
+    using     T_A_KQ    = typename mma_tile_sizes<DV, ncols>::T_A_KQ;
+    using     T_B_KQ    = typename mma_tile_sizes<DV, ncols>::T_B_KQ;
+    using     T_C_KQ    = typename mma_tile_sizes<DV, ncols>::T_C_KQ;
+    using     T_A_VKQ   = typename mma_tile_sizes<DV, ncols>::T_A_VKQ;
+    using     T_B_VKQ   = typename mma_tile_sizes<DV, ncols>::T_B_VKQ;
+    using     T_C_VKQ   = typename mma_tile_sizes<DV, ncols>::T_C_VKQ;

    constexpr int  cols_per_warp   = T_B_KQ::I;
    constexpr int  cols_per_thread = get_cols_per_thread();
@@ -1061,6 +1181,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    T_B_KQ    Q_B[(Q_in_reg ? DKQ/(2*T_B_KQ::J) : 1)];
 #if defined(TURING_MMA_AVAILABLE)
    T_C_VKQ VKQ_C[cols_per_warp == 8 ? DV/T_C_VKQ::I : DV/(2*T_C_VKQ::J)];
+#elif defined(AMD_WMMA_AVAILABLE) && defined(RDNA3)
+    T_C_VKQ VKQ_C[DV % 32 != 0       ? DV/T_C_VKQ::J : DV/(2*T_C_VKQ::J)];
 #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
    T_C_VKQ VKQ_C[                                     DV/(2*T_C_VKQ::J)];
 #else // Volta
@@ -1269,12 +1391,23 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
            }
        }
 #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
-        const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]);
+        if constexpr (std::is_same_v<decltype(T_C_VKQ::x), half2[T_C_VKQ::ne]>) {
+            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]);
 #pragma unroll
-        for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
+            for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
 #pragma unroll
-            for (int l = 0; l < T_C_VKQ::ne; ++l) {
-                VKQ_C[i].x[l] *= KQ_max_scale_h2;
+                for (int l = 0; l < T_C_VKQ::ne; ++l) {
+                    VKQ_C[i].x[l] *= KQ_max_scale_h2;
+                }
+            }
+        } else {
+            static_assert(std::is_same_v<decltype(T_C_VKQ::x), float[T_C_VKQ::ne]>, "bad VKQ type");
+#pragma unroll
+            for (int i = 0; i < DV/T_C_VKQ::J; ++i) {
+#pragma unroll
+                for (int l = 0; l < T_C_VKQ::ne; ++l) {
+                    VKQ_C[i].x[l] *= KQ_max_scale[0];
+                }
            }
        }
 #else // Volta
@@ -1433,6 +1566,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
 #pragma unroll
    for (int k00 = 0; k00 < DV/2; k00 += nbatch_combine) {
        if constexpr (cols_per_warp == 8) {
+            static_assert(std::is_same_v<decltype(T_C_VKQ::x), half2[T_C_VKQ::ne]>, "bad VKQ type");
            const int jc_cwd = threadIdx.y*T_B_KQ::I + T_B_KQ::get_i(-1); // jc combine write data
 #pragma unroll
            for (int k1 = 0; k1 < nbatch_combine; k1 += T_B_KQ::J) {
@@ -1447,14 +1581,45 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
            }
        } else {
            const int j0 = threadIdx.y*cols_per_warp;
+            if constexpr (std::is_same_v<decltype(T_C_VKQ::x), half2[T_C_VKQ::ne]>) {
+                if constexpr (T_C_VKQ::dl == DATA_LAYOUT_I_MAJOR) {
 #pragma unroll
-            for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J) {
+                    for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J) {
 #pragma unroll
-                for (int l = 0; l < T_C_VKQ::ne; ++l) {
-                    const int j = j0 + T_C_VKQ::get_i(l);
-                    const int k = k1 + T_C_VKQ::get_j(l);
+                        for (int l = 0; l < T_C_VKQ::ne; ++l) {
+                            const int j = j0 + T_C_VKQ::get_i(l);
+                            const int k = k1 + T_C_VKQ::get_j(l);

-                    tile_Q[j*tile_stride + k] = VKQ_C[(k00 + k1)/T_C_VKQ::J].x[l];
+                            tile_Q[j*tile_stride + k] = VKQ_C[(k00 + k1)/T_C_VKQ::J].x[l];
+                        }
+                    }
+                } else {
+                    static_assert(T_C_VKQ::dl == DATA_LAYOUT_I_MAJOR_SCRAMBLED, "bad T_C_VKQ data layout");
+                    using T_C_VKQ_us = tile<T_C_VKQ::I, T_C_VKQ::J, half2, DATA_LAYOUT_I_MAJOR>; // us == unscrambled
+#pragma unroll
+                    for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J) {
+                        const T_C_VKQ_us VKQ_C_us = unscramble(VKQ_C[(k00 + k1)/T_C_VKQ::J]);
+#pragma unroll
+                        for (int l = 0; l < T_C_VKQ_us::ne; ++l) {
+                            const int j = j0 + T_C_VKQ_us::get_i(l);
+                            const int k = k1 + T_C_VKQ_us::get_j(l);
+
+                            tile_Q[j*tile_stride + k] = VKQ_C_us.x[l];
+                        }
+                    }
+                }
+            } else {
+                static_assert(std::is_same_v<decltype(T_C_VKQ::x), float[T_C_VKQ::ne]>, "bad VKQ type");
+                half * tile_Q_h = (half *) tile_Q;
+#pragma unroll
+                for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J/2) {
+#pragma unroll
+                    for (int l = 0; l < T_C_VKQ::ne; ++l) {
+                        const int j = j0 + T_C_VKQ::get_i(l);
+                        const int k = 2*k1 + T_C_VKQ::get_j(l);
+
+                        tile_Q_h[j*(2*tile_stride) + k] = VKQ_C[(k00 + k1)/(T_C_VKQ::J/2)].x[l];
+                    }
                }
            }
        }
@@ -1532,7 +1697,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        stride_Q1, stride_Q2, stride_K, stride_V, stride_mask,
        jt, kb0_start, kb0_stop);
    NO_DEVICE_CODE;
-#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
+#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
 }

 template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap, bool V_is_K_view>
@@ -1559,7 +1724,7 @@ static __global__ void flash_attn_ext_f16(
                            const int32_t nb21, const int32_t nb22, const int64_t nb23,
                            const int32_t ne31, const int32_t ne32, const int32_t ne33,
                            const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE))
+#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))

    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(DKQ == 128 || DKQ == 256 || DKQ == 512)) {
@@ -1585,14 +1750,14 @@ static __global__ void flash_attn_ext_f16(
 #endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING

 #if defined(AMD_WMMA_AVAILABLE)
-    if (ncols1*ncols2 > 32 || ncols1*ncols2 < 16 || DKQ > 128 || ncols2 == 1) {
+    if (ncols1*ncols2 < 16 || ncols2 == 1 || DKQ > 128) {
        NO_DEVICE_CODE;
        return;
    }
 #endif // defined(AMD_WMMA_AVAILABLE)

 #if defined(AMD_MFMA_AVAILABLE)
-    if (DKQ != 64 && DKQ != 80 && DKQ != 96 && DKQ != 112 && DKQ != 128) {
+    if (ncols1*ncols2 < 16 || DKQ > 256) {
        NO_DEVICE_CODE;
        return;
    }
@@ -1715,7 +1880,7 @@ static __global__ void flash_attn_ext_f16(
              ne31, ne32, ne33,
              nb31, nb32, nb33);
    NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE))
+#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))
 }

 template <int DKQ, int DV, int ncols1, int ncols2>
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -19,13 +19,14 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
    }

    if constexpr (ncols2 <= 16) {
-        if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) {
+        if (Q->ne[1] <= 16/ncols2) {
            ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
            return;
        }
    }

-    if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || amd_wmma_available(cc) || Q->ne[1] <= 32/ncols2) {
+    if (Q->ne[1] <= 32/ncols2 || (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING) ||
+            (GGML_CUDA_CC_IS_AMD(cc) && DKQ > 256)) {
        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
        return;
    }
@@ -477,12 +478,13 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
        return BEST_FATTN_KERNEL_MMA_F16;
    }

+    const int ncols2_max = Q->ne[0] == 320 ? 32 : ((Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8);
+    int gqa_ratio_eff = 1;
+    while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
+        gqa_ratio_eff *= 2;
+    }
+
    if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
-        int gqa_ratio_eff = 1;
-        const int ncols2_max = (Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8;
-        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
-            gqa_ratio_eff *= 2;
-        }
        if (can_use_vector_kernel && Q->ne[1] * gqa_ratio_eff <= 2) {
            return BEST_FATTN_KERNEL_VEC;
        }
@@ -500,41 +502,22 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
        return BEST_FATTN_KERNEL_WMMA_F16;
    }

-    if (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc) && gqa_opt_applies && Q->ne[0] <= 128 && Q->ne[0] != 40 && Q->ne[0] != 72) {
-        if (can_use_vector_kernel) {
-            if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
-                if (Q->ne[1] == 1) {
-                    if (!gqa_opt_applies) {
-                        return BEST_FATTN_KERNEL_VEC;
-                    }
-                }
-            } else {
-                if (Q->ne[1] <= 2) {
-                    return BEST_FATTN_KERNEL_VEC;
-                }
-            }
-        }
-        int gqa_ratio_eff = 1;
-        const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
-        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
-            gqa_ratio_eff *= 2;
-        }
-        if (Q->ne[1] * gqa_ratio_eff <= 8) {
-            return BEST_FATTN_KERNEL_TILE; // AMD WMMA is only faster if the full tile width of 16 can be utilized.
-        }
-        return BEST_FATTN_KERNEL_MMA_F16;
-    }
-
-    // Use MFMA flash attention for CDNA (MI100+):
-    if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
-        const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1);
-        // MMA vs tile crossover benchmarked on MI300X @ d32768:
-        //   hsk=64  (gqa=4): MMA wins at eff >= 128 (+11%)
-        //   hsk=128 (gqa=4): MMA wins at eff >= 128 (+4%)
-        if (eff_nq >= (GGML_CUDA_CC_IS_CDNA1(cc) && Q->ne[0] == 64 ? 64 : 128)) {
+    // AMD MFMA needs a minimum batch size to outperform the tile kernel; the required size grows with the head size.
+    if ((amd_mfma_available(cc) && Q->ne[0] <= 256) && Q->ne[0] != 40 && Q->ne[0] != 72) {
+        if ((Q->ne[0] <= 64 && Q->ne[1] * gqa_ratio_eff > 8)) {
            return BEST_FATTN_KERNEL_MMA_F16;
        }
-        // Fall through to tile kernel for small effective batch sizes.
+        if ((Q->ne[0] <= 128 && Q->ne[1] * gqa_ratio_eff > 16)) {
+            return BEST_FATTN_KERNEL_MMA_F16;
+        }
+        if ((Q->ne[0] <= 256 && Q->ne[1] * gqa_ratio_eff > 64)) {
+            return BEST_FATTN_KERNEL_MMA_F16;
+        }
+    }
+
+    // AMD WMMA is always faster than the tile kernel if the full tile width of 16 can be utilized.
+    if ((amd_wmma_available(cc) && gqa_opt_applies && Q->ne[0] <= 128) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[1] * gqa_ratio_eff > 8) {
+        return BEST_FATTN_KERNEL_MMA_F16;
    }

    // If there are no tensor cores available, use the generic tile kernel:
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3929,10 +3929,25 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
        // closure check: the trailing add must read the same x as the leading mul
        const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];

-        const bool type_ok  = (x->type == GGML_TYPE_F32 || x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16);
+        // Kernel iterates over total = T * C, so x and add must be 2D and
+        // a / inv_b must collapse to [1, C, 1, 1]. Higher dims are not handled.
+        const bool dim_ok   = (x->ne[2]   == 1 && x->ne[3]   == 1) &&
+                              (add->ne[2] == 1 && add->ne[3] == 1) &&
+                              (a->ne[2]   == 1 && a->ne[3]   == 1);
        const bool shape_ok = ggml_are_same_shape(a, inv_b) && a->ne[0] == 1 && a->ne[1] == x->ne[1];

-        if (type_ok && shape_ok && x_in_add == x && add->type == x->type) {
+        // x must be in the supported whitelist and every operand / intermediate
+        // result must share x's type, since launch_snake casts a / inv_b as
+        // float and templates the kernel on a single T. Mixed precision chains
+        // fall back to the naive path.
+        const ggml_tensor * sin1 = cgraph->nodes[i + 1];
+        const bool types_ok = (x->type == GGML_TYPE_F32 || x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16) &&
+                              (a->type    == x->type) && (inv_b->type == x->type) &&
+                              (mul0->type == x->type) && (sin1->type  == x->type) &&
+                              (sqr->type  == x->type) && (mul1->type  == x->type) &&
+                              (add->type  == x->type);
+
+        if (types_ok && shape_ok && dim_ok && x_in_add == x) {
            ggml_cuda_op_snake_fused(*cuda_ctx, x, a, inv_b, add);
            return 4;
        }
@@ -5291,12 +5306,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
-        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
-        case GGML_OP_SUB:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
@@ -5305,6 +5316,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_CLAMP:
        case GGML_OP_LOG:
            return true;
+        case GGML_OP_ADD:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+            return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
+                   (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
+                   (op->type         == GGML_TYPE_F32 || op->type         == GGML_TYPE_F16);
        case GGML_OP_SSM_SCAN: {
            if (op->src[3]->ne[0] == 1) {
                // Mamba2
--- a/ggml/src/ggml-cuda/im2col.cu
+++ b/ggml/src/ggml-cuda/im2col.cu
@@ -1,5 +1,6 @@
 #include "im2col.cuh"

+#define MAX_GRIDDIM_Y 65535
 #define MAX_GRIDDIM_Z 65535

 template <typename T>
@@ -18,22 +19,23 @@ static  __global__ void im2col_kernel(
    const int64_t ikh = rem / KW;
    const int64_t ikw = rem - ikh * KW;

-    const int64_t  iow = blockIdx.y;
-    for (int64_t iz = blockIdx.z; iz < N_OH; iz+=MAX_GRIDDIM_Z) {
-        const int64_t  in = iz / OH;
-        const int64_t  ioh = iz - in * OH;
+    for (int64_t iow = blockIdx.y; iow < OW; iow += MAX_GRIDDIM_Y) {
+        for (int64_t iz = blockIdx.z; iz < N_OH; iz += MAX_GRIDDIM_Z) {
+            const int64_t  in = iz / OH;
+            const int64_t  ioh = iz - in * OH;

-        const int64_t iiw = iow * s0 + ikw * d0 - p0;
-        const int64_t iih = ioh * s1 + ikh * d1 - p1;
+            const int64_t iiw = iow * s0 + ikw * d0 - p0;
+            const int64_t iih = ioh * s1 + ikh * d1 - p1;

-        const int64_t offset_dst =
-            ((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;
+            const int64_t offset_dst =
+                ((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;

-        if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-            dst[offset_dst] = 0.0f;
-        } else {
-            const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
-            dst[offset_dst] = x[offset_src + iih * IW + iiw];
+            if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                dst[offset_dst] = 0.0f;
+            } else {
+                const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
+                dst[offset_dst] = x[offset_src + iih * IW + iiw];
+            }
        }
    }

@@ -51,7 +53,7 @@ static void im2col_cuda(const float * x, T* dst,
    const int64_t num_blocks = (IC_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
    const int64_t N_OH = N * OH;
    const int64_t KH_KW = KW*KH;
-    dim3 block_nums(num_blocks, OW, MIN(N_OH, MAX_GRIDDIM_Z));
+    dim3 block_nums(num_blocks, MIN(OW, MAX_GRIDDIM_Y), MIN(N_OH, MAX_GRIDDIM_Z));
    im2col_kernel<<<block_nums, MIN(IC_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(x, dst, IC, IW, IH, OH, OW, KW, KH,
                                                                                     IC_IH_IW, IH_IW, N_OH, KH_KW, IC_KH_KW,
                                                                                     s0, s1, p0, p1, d0, d1);
@@ -136,23 +138,24 @@ static  __global__ void im2col_3d_kernel(
    const int64_t ikh = (i - iic * KD_KH_KW - ikd * KH_KW) / KW;
    const int64_t ikw = i % KW;

-    const int64_t  iow = blockIdx.y;
-    for (int64_t iz = blockIdx.z; iz < N_OD_OH; iz+=MAX_GRIDDIM_Z) {
-        const int64_t in  = iz / OD_OH;
-        const int64_t iod = (iz - in*OD_OH) / OH;
-        const int64_t ioh = iz % OH;
+    for (int64_t iow = blockIdx.y; iow < OW; iow += MAX_GRIDDIM_Y) {
+        for (int64_t iz = blockIdx.z; iz < N_OD_OH; iz += MAX_GRIDDIM_Z) {
+            const int64_t in  = iz / OD_OH;
+            const int64_t iod = (iz - in*OD_OH) / OH;
+            const int64_t ioh = iz % OH;

-        const int64_t iiw = iow * s0 + ikw * d0 - p0;
-        const int64_t iih = ioh * s1 + ikh * d1 - p1;
-        const int64_t iid = iod * s2 + ikd * d2 - p2;
+            const int64_t iiw = iow * s0 + ikw * d0 - p0;
+            const int64_t iih = ioh * s1 + ikh * d1 - p1;
+            const int64_t iid = iod * s2 + ikd * d2 - p2;

-        const int64_t offset_dst = in*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
+            const int64_t offset_dst = in*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;

-        if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
-            dst[offset_dst] = 0.0f;
-        } else {
-            const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x);
-            dst[offset_dst] = src[offset_src];
+            if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                dst[offset_dst] = 0.0f;
+            } else {
+                const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x);
+                dst[offset_dst] = src[offset_src];
+            }
        }
    }
 }
@@ -178,7 +181,7 @@ static void im2col_3d_cuda(const float * src, T* dst,
    const int64_t OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
    const int64_t OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
    const int64_t num_blocks = (IC_KD_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OW, MIN(N_OD_OH, MAX_GRIDDIM_Z));
+    dim3 block_nums(num_blocks, MIN(OW, MAX_GRIDDIM_Y), MIN(N_OD_OH, MAX_GRIDDIM_Z));
    im2col_3d_kernel<<<block_nums, MIN(IC_KD_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
                                                                                           OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW,
                                                                                           IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW,
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -80,6 +80,7 @@ namespace ggml_cuda_mma {
        DATA_LAYOUT_J_MAJOR           = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3.
        DATA_LAYOUT_I_MAJOR_MIRRORED  = 20, // Volta, matrix A&B for RDNA3.
        DATA_LAYOUT_J_MAJOR_MIRRORED  = 30,
+        DATA_LAYOUT_I_MAJOR_SCRAMBLED = 40, // Scrambled matrix C for faster transposition (RDNA4/CDNA), convert to float to unscramble.
    };
    // Implemented mma combinations are:
    //   - (I_MAJOR, I_MAJOR)          -> I_MAJOR
@@ -312,13 +313,19 @@ namespace ggml_cuda_mma {
        half2 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
-            if (I == 16 && J == 8) return true;
+            if (I == 16 && J ==  8) return true;
+            if (I == 16 && J == 16) return true;
+            if (I == 32 && J ==  8) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 16 && J == 8) {
                return threadIdx.x % 16;
+            } else if constexpr (I == 16 && J == 16) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 8) {
+                return (threadIdx.x % 16) * 2 + l / (ne/2);
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -327,7 +334,15 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 8) {
-                return ne * (threadIdx.x / 16) + l;
+                return (threadIdx.x / 16) * ne + l;
+            } else if constexpr (I == 16 && J == 16) {
+#ifdef RDNA3
+                return l*2 + (threadIdx.x / 16);
+#else
+                return (threadIdx.x / 16) * ne + l;
+#endif // RDNA3
+            } else if constexpr (I == 32 && J == 8) {
+                return (threadIdx.x / 16) * (ne/2) + l % (ne/2);
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -338,13 +353,19 @@ namespace ggml_cuda_mma {
        half2 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
-            if (I == 16 && J == 8) return true;
+            if (I == 16 && J ==  8) return true;
+            if (I == 16 && J == 16) return true;
+            if (I == 32 && J ==  8) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 16 && J == 8) {
                return threadIdx.x % 16;
+            } else if constexpr (I == 16 && J == 16) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 8) {
+                return (threadIdx.x % 16) * 2 + l / (ne/2);
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -353,7 +374,11 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 8) {
-                return ne * (threadIdx.x / 16) + l;
+                return (threadIdx.x / 16) * ne + l;
+            } else if constexpr (I == 16 && J == 16) {
+                return (threadIdx.x / 16) * ne + l;
+            } else if constexpr (I == 32 && J == 8) {
+                return (threadIdx.x / 16) * (ne/2) + l % (ne/2);
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -516,12 +541,15 @@ namespace ggml_cuda_mma {
            if (I == 16 && J == 16) return true;
            if (I == 16 && J == 8)  return true;
            if (I == 16 && J == 4)  return true;
+            if (I == 32 && J == 8)  return true;
            return false;
        }

-        static __device__ __forceinline__ int get_i(const int /*l*/) {
-            if constexpr (supported()) {
+        static __device__ __forceinline__ int get_i(const int l) {
+            if constexpr (I == 16) {
                return threadIdx.x % 16;
+            } else if constexpr (I == 32) {
+                return (threadIdx.x % 16) * 2 + l / (ne/2);
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -529,8 +557,10 @@ namespace ggml_cuda_mma {
        }

        static __device__ __forceinline__ int get_j(const int l) {
-            if constexpr (supported()) {
+            if constexpr (I == 16) {
                return l;
+            } else if constexpr (I == 32) {
+                return l % (ne/2);
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -644,6 +674,40 @@ namespace ggml_cuda_mma {
        }
    };

+    template <int I_, int J_>
+    struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_SCRAMBLED> {
+        static constexpr int         I  = I_;
+        static constexpr int         J  = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_SCRAMBLED;
+
+        static constexpr int ne = I * J / ggml_cuda_get_physical_warp_size();
+        half2 x[ne] = {{0.0f, 0.0f}};
+
+        static constexpr __device__ bool supported() {
+            if (I == 16 && J == 16) return true;
+            return false;
+        }
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR>::get_i(l);
+        }
+    };
+
+    static __device__ __forceinline__ tile<16, 16, half2, DATA_LAYOUT_I_MAJOR> unscramble(const tile<16, 16, half2, DATA_LAYOUT_I_MAJOR_SCRAMBLED> & t) {
+#if defined(AMD_MFMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+        tile<16, 16, half2, DATA_LAYOUT_I_MAJOR> ret;
+#pragma unroll
+        for (int l0 = 0; l0 < t.ne/2; ++l0) {
+            ret.x[2*l0 + 0] =  __lows2half2(t.x[l0], t.x[l0 + t.ne/2]);
+            ret.x[2*l0 + 1] = __highs2half2(t.x[l0], t.x[l0 + t.ne/2]);
+        }
+        return ret;
+#else
+        NO_DEVICE_CODE;
+        GGML_UNUSED(t);
+#endif // defined(AMD_MFMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+    }
+
 #if defined(TURING_MMA_AVAILABLE)
    template <int I, int J>
    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -660,6 +724,21 @@ namespace ggml_cuda_mma {
        ret.x[0] = ggml_cuda_movmatrix(t.x[0]);
        ret.x[1] = ggml_cuda_movmatrix(t.x[1]);

+        return ret;
+    }
+#elif defined(AMD_WMMA_AVAILABLE) && defined(RDNA3)
+    static __device__ __forceinline__ tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> get_half2(
+            const tile<16, 16, float, DATA_LAYOUT_I_MAJOR> & tile_float) {
+        tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> ret;
+#pragma unroll
+        for (int l = 0; l < tile_float.ne; ++l) {
+            float tmp[2];
+            int i = threadIdx.x / 16;
+            tmp[i] = tile_float.x[l];
+            i ^= 1;
+            tmp[i] = __shfl_xor_sync(0xFFFFFFFF, tile_float.x[l], 16, WARP_SIZE);
+            ret.x[l] = make_half2(tmp[0], tmp[1]);
+        }
        return ret;
    }
 #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
@@ -802,21 +881,35 @@ namespace ggml_cuda_mma {
 #endif // defined(VOLTA_MMA_AVAILABLE)
    }

-    template <typename T>
+    template <int I, typename T, data_layout dl>
    static __device__ __forceinline__ void load_ldmatrix_trans(
-            tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
+            tile<I, 8, T, dl> & t, const T * __restrict__ xs0, const int stride) {
 #ifdef TURING_MMA_AVAILABLE
+        static_assert(I == 16, "bad tile width");
+        static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
        int * xi = (int *) t.x;
        const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
        asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
            : "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3])
            : "l"(xs));
 #elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
-        half * xh = (half *) t.x;
+        static_assert(dl == DATA_LAYOUT_I_MAJOR || dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
+        if constexpr (I == 32) {
 #pragma unroll
-        for (int l = 0; l < t.ne; ++l) {
-            xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)];
-            xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)];
+            for (int l0 = 0; l0 < t.ne/2; ++l0) {
+                const half2 tmp0 = xs0[(2*t.get_j(l0) + 0)*stride + t.get_i(l0)/2];
+                const half2 tmp1 = xs0[(2*t.get_j(l0) + 1)*stride + t.get_i(l0)/2];
+
+                t.x[l0]          =  __lows2half2(tmp0, tmp1);
+                t.x[l0 + t.ne/2] = __highs2half2(tmp0, tmp1);
+            }
+        } else {
+            half * xh = (half *) t.x;
+#pragma unroll
+            for (int l = 0; l < t.ne; ++l) {
+                xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)];
+                xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)];
+            }
        }
 #else
        GGML_UNUSED_VARS(t, xs0, stride);
@@ -972,6 +1065,20 @@ namespace ggml_cuda_mma {
 #endif // TURING_MMA_AVAILABLE
    }

+    static __device__ __forceinline__ void mma(
+            tile<16, 16, half2, DATA_LAYOUT_I_MAJOR_SCRAMBLED> & D, const tile<32, 8, half2, DATA_LAYOUT_I_MAJOR> & A,
+            const tile<16, 8, half2, DATA_LAYOUT_I_MAJOR> & B) {
+#if defined(AMD_MFMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+        tile<16, 8, half2>       * D16 = (tile<16, 8, half2>       *) &D;
+        const tile<16, 8, half2> * A16 = (const tile<16, 8, half2> *) &A;
+        mma(D16[0], A16[0], B);
+        mma(D16[1], A16[1], B);
+#else
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
+#endif // defined(AMD_MFMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+    }
+
    template <data_layout dl_ab, data_layout dl_d>
    static __device__ __forceinline__ void mma(
            tile<16, 8, float, dl_d> & D, const tile<16, 8, float, dl_ab> & A, const tile<8, 8, float, dl_ab> & B) {
@@ -1296,6 +1403,22 @@ namespace ggml_cuda_mma {
 #endif // defined(VOLTA_MMA_AVAILABLE)
    }

+    static __device__ __forceinline__ void mma(
+            tile<16, 16, half2, DATA_LAYOUT_I_MAJOR> & D, const tile<32,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & A,
+            const tile<16,  8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
+#if defined(AMD_WMMA_AVAILABLE) && defined(RDNA3)
+        using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
+        halfx16_t       * xD = (halfx16_t       *) D.x;
+        const halfx16_t * xA = (const halfx16_t *) A.x;
+        const halfx16_t * xB = (const halfx16_t *) B.x;
+        xD[0] = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(xA[0], xB[0], xD[0], /*opsel =*/ 0);
+        xD[0] = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(xA[1], xB[0], xD[0], /*opsel =*/ 1);
+#else
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
+#endif // defined(AMD_WMMA_AVAILABLE) && defined(RDNA3)
+    }
+
    template <data_layout dl_d, data_layout dl_ab>
    static __device__ __forceinline__ void mma(
            tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2865,6 +2865,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
                case GGML_UNARY_OP_NEG:      return HTP_OP_UNARY_NEG;
                case GGML_UNARY_OP_EXP:      return HTP_OP_UNARY_EXP;
                case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
+                case GGML_UNARY_OP_TANH:     return HTP_OP_UNARY_TANH;
            default:
                break;
            }
@@ -3335,6 +3336,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
                case GGML_UNARY_OP_EXP:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_SOFTPLUS:
+                case GGML_UNARY_OP_TANH:
                    supp = ggml_hexagon_supported_unary(sess, op);
                    break;
                case GGML_UNARY_OP_SILU:
--- a/ggml/src/ggml-hexagon/htp/cpy-ops.c
+++ b/ggml/src/ggml-hexagon/htp/cpy-ops.c
@@ -88,6 +88,29 @@ static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp
    const uint32_t ir0 = dr * ith;
    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;

+    // Fast path: when both src0 and dst are contiguous in memory, replace the
+    // element-by-element loop with a single bulk HVX copy per (i03, i02) slice.
+    const bool src0_contig = (nb00 == ct->src0_type_size) &&
+                             (nb01 == ne00 * nb00) &&
+                             (nb02 == ne01 * nb01) &&
+                             (nb03 == ne02 * nb02);
+    const bool dst_contig  = (nb0  == ct->dst_type_size)  &&
+                             (nb1  == ne0  * nb0)  &&
+                             (nb2  == ne1  * nb1)  &&
+                             (nb3  == ne2  * nb2);
+
+    if (src0_contig && dst_contig) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01;
+                uint32_t  flat    = ((i03*ne02 + i02)*ne01 + ir0) * ne00;
+                uint8_t * dst_ptr = (uint8_t *) dst->data  + flat * ct->src0_type_size;
+                hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ct->src0_type_size);
+            }
+        }
+        return;
+    }
+
    // dst counters
    int64_t k10 = 0;
    int64_t i11 = 0;
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@@ -760,8 +760,9 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
            // ALiBi slopes — only needed when has_alibi (scheme A)
            HVX_Vector v_slope0, v_slope1;
            if (args->has_alibi) {
-                v_slope0 = hvx_vec_splat_f16(args->slopes[r + 0]);
-                v_slope1 = (r + 1 < (int) n_rows_g) ? hvx_vec_splat_f16(args->slopes[r + 1]) : Q6_V_vzero();
+                HVX_Vector v_s = hvx_vmemu(args->slopes + r);
+                v_slope0 = hvx_vec_repl_f16(v_s);
+                v_slope1 = (r + 1 < (int) n_rows_g) ? hvx_vec_repl_f16(Q6_V_vror_VR(v_s, 2)) : Q6_V_vzero();
            }

            const HVX_Vector v_threshold = Q6_Vh_vsplat_R(0xcc00);  // fp16 -16.0 (hoisted outside for-c)
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -180,12 +180,10 @@ next_nc:
 // Dequantize one x4x2 Q4_0 group (32 elements from 32 packed bytes) -> 32 FP16 in first 64 bytes.
 // In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles
 // of the same 32 packed bytes.
-static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(
-         const uint8_t *packed_32, bool upper_nibbles,
-         const __fp16 *scale, const HVX_Vector vlut_cvt) {
+static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
    HVX_Vector vq = hvx_vmemu(packed_32);
    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
-    HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
+    HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
    // q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
@@ -223,9 +221,10 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
    HVX_Vector v_hi = Q6_V_hi_W(vp);  // [group2: 32 fp16 | group3: 32 fp16]

    // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
-    HVX_VectorPred q64 = Q6_Q_vsetq_R(64);
-    HVX_Vector v_sc01 = Q6_V_vmux_QVV(q64, hvx_vec_splat_f16(scales_4[0]), hvx_vec_splat_f16(scales_4[1]));
-    HVX_Vector v_sc23 = Q6_V_vmux_QVV(q64, hvx_vec_splat_f16(scales_4[2]), hvx_vec_splat_f16(scales_4[3]));
+    volatile HVX_Vector vscale = hvx_vmemu(scales_4);
+
+    HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
+    HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));

    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
@@ -237,10 +236,10 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(

 // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
 static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(const int8_t *quants_32, const __fp16 *scale) {
-    HVX_Vector vq = hvx_vmemu(quants_32);
-    HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
-    HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(vq));
-    HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
+    HVX_Vector vq       = hvx_vmemu(quants_32);
+    HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
+    HVX_Vector v0       = Q6_V_lo_W(Q6_Wh_vunpack_Vb(vq));
+    HVX_Vector v_hf     = Q6_Vhf_equals_Vh(v0);
    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
 }

@@ -521,12 +520,8 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                const uint8_t *r0 = vtcm_src + row0 * row_stride;
                const uint8_t *r1 = vtcm_src + row1 * row_stride;

-                HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx(
-                    (const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
-                HVX_Vector v1 = (row1 < n_cols)
-                    ? dequantize_x4x2_q8_0_group_hvx(
-                        (const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off))
-                    : Q6_V_vzero();
+                HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
+                HVX_Vector v1 = (row1 < n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();

                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
--- a/ggml/src/ggml-hexagon/htp/hmx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-utils.h
@@ -77,16 +77,18 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
            const HVX_Vector v_off0         = Q6_Vw_vadd_VwVw(v_scat_base, Q6_V_vsplat_R(local_r * 4));
            const HVX_Vector v_off1         = Q6_Vw_vadd_VwVw(v_off0, v_scat_step);

-            __fp16 *        tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
-            const uint8_t * p0        = (const uint8_t *) (vtcm_src + r * src_stride);
-            const uint8_t * p1        = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
+            __fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
+            const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
+            const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
+
+            assert(hex_is_aligned(p0, 128));
+            assert(hex_is_aligned(p1, 128));
+            assert(c_byte_step % 128 == 0);

            if (p1) {
                for (int i = 0; i < n_c_iters; ++i) {
-                    HVX_Vector v0 = hvx_vmemu(p0);
-                    p0 += c_byte_step;
-                    HVX_Vector v1 = hvx_vmemu(p1);
-                    p1 += c_byte_step;
+                    HVX_Vector v0 = hvx_vmem(p0); p0 += c_byte_step;
+                    HVX_Vector v1 = hvx_vmem(p1); p1 += c_byte_step;
                    Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off0, v0);
                    Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off1, v1);
                    tile_base += dst_step;
@@ -94,8 +96,7 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
            } else {
                const HVX_Vector vzero = Q6_V_vzero();
                for (int i = 0; i < n_c_iters; ++i) {
-                    HVX_Vector v0 = hvx_vmemu(p0);
-                    p0 += c_byte_step;
+                    HVX_Vector v0 = hvx_vmem(p0); p0 += c_byte_step;
                    Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off0, v0);
                    Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off1, vzero);
                    tile_base += dst_step;
@@ -116,16 +117,14 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
            const HVX_Vector v_off0         = Q6_Vw_vadd_VwVw(v_scat_base, Q6_V_vsplat_R(local_r * 4));
            const HVX_Vector v_off1         = Q6_Vw_vadd_VwVw(v_off0, v_scat_step);

-            __fp16 *        tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
-            const uint8_t * p0        = (const uint8_t *) (vtcm_src + r * src_stride);
-            const uint8_t * p1        = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
+            __fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
+            const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
+            const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;

            if (p1) {
                for (int i = 0; i < n_c_iters; ++i) {
-                    HVX_Vector v0 = hvx_vmemu(p0);
-                    p0 += c_byte_step;
-                    HVX_Vector v1 = hvx_vmemu(p1);
-                    p1 += c_byte_step;
+                    HVX_Vector v0 = hvx_vmemu(p0); p0 += c_byte_step;
+                    HVX_Vector v1 = hvx_vmemu(p1); p1 += c_byte_step;
                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off0, v0);
                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off1, v1);
                    tile_base += dst_step;
@@ -133,8 +132,7 @@ static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
            } else {
                const HVX_Vector vzero = Q6_V_vzero();
                for (int i = 0; i < n_c_iters; ++i) {
-                    HVX_Vector v0 = hvx_vmemu(p0);
-                    p0 += c_byte_step;
+                    HVX_Vector v0 = hvx_vmemu(p0); p0 += c_byte_step;
                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off0, v0);
                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off1, vzero);
                    tile_base += dst_step;
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -62,6 +62,7 @@ enum htp_op_code {
    HTP_OP_UNARY_EXP,
    HTP_OP_UNARY_NEG,
    HTP_OP_UNARY_SOFTPLUS,
+    HTP_OP_UNARY_TANH,
    HTP_OP_GLU_SWIGLU,
    HTP_OP_GLU_SWIGLU_OAI,
    HTP_OP_GLU_GEGLU,
--- a/ggml/src/ggml-hexagon/htp/hvx-repl.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-repl.h
@@ -0,0 +1,76 @@
+#ifndef HVX_REPL_H
+#define HVX_REPL_H
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hvx-base.h"
+
+// Replicate (splat) helpers built on the HVX vdelta permute (Q6_V_vdelta_VV).
+// Every helper feeds the input vector plus a 128-byte control table to hvx_vec_repl().
+
+// Permute v with Q6_V_vdelta_VV, using the control vector loaded from ctrl.
+// ctrl is fetched via hvx_vmem() (presumably the aligned vector load), so callers should
+// pass a full 128-byte table with 128-byte alignment, as the tables below do.
+static inline HVX_Vector hvx_vec_repl(HVX_Vector v, const uint8_t * ctrl) {
+    return Q6_V_vdelta_VV(v, hvx_vmem(ctrl));
+}
+
+// Replicate the first 4 bytes (one u32 element) of v across all lanes.
+static inline HVX_Vector hvx_vec_repl_u32(HVX_Vector v) {
+    // vdelta control to replicate first 4 bytes across all lanes
+    static const uint8_t __attribute__((aligned(128))) repl[128] = {
+        0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+        0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+        0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+        0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
+    };
+    return hvx_vec_repl(v, repl);
+}
+
+// Replicate the first 4 bytes (one f32 element) of v across all lanes.
+// The permutation is byte-wise and type-agnostic, so this reuses the u32 control table
+// rather than carrying a second, byte-identical 128-byte copy of it.
+static inline HVX_Vector hvx_vec_repl_f32(HVX_Vector v) {
+    return hvx_vec_repl_u32(v);
+}
+
+// Replicate the first 2 bytes (one f16 element) of v across all lanes.
+static inline HVX_Vector hvx_vec_repl_f16(HVX_Vector v) {
+    // vdelta control to replicate first two bytes across all lanes
+    static const uint8_t __attribute__((aligned(128))) repl[128] = {
+        0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x40, 0x40, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+    };
+    return hvx_vec_repl(v, repl);
+}
+
+// Splat a pair of f16 values from v: f16[0] fills the first half of the result and
+// f16[1] fills the second half.
+static inline HVX_Vector hvx_vec_repl_2x_f16(HVX_Vector v) {
+    // vdelta control to splat a pair of f16s: first half = f16[0], second half = f16[1]
+    static const uint8_t __attribute__((aligned(128))) repl[128] = {
+        0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
+        0x02, 0x02, 0x40, 0x40, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
+        0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
+        0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
+        0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04,
+    };
+    return hvx_vec_repl(v, repl);
+}
+
+#endif // HVX_REPL_H
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -5,6 +5,7 @@

 #include "hvx-types.h"
 #include "hvx-copy.h"
+#include "hvx-repl.h"
 #include "hvx-scale.h"
 #include "hvx-exp.h"
 #include "hvx-inverse.h"
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -542,6 +542,7 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_UNARY_SIGMOID:
        case HTP_OP_UNARY_NEG:
        case HTP_OP_UNARY_EXP:
+        case HTP_OP_UNARY_TANH:
        case HTP_OP_L2_NORM:
            return op_unary(octx);

--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -373,6 +373,21 @@ static void l2_norm_f32(const float * restrict src,
    }
 }

+// Row-wise tanh on f32 data: runs hvx_tanh_f32_aa over num_rows rows of row_elems values,
+// stepping src and dst by row_size bytes per row. spad and op_params are not read here.
+static void tanh_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad,
+                     const uint32_t num_rows, const uint32_t row_elems, const size_t row_size,
+                     int32_t * op_params) {
+    const uint8_t * restrict src_row = (const uint8_t *) src;
+    uint8_t * restrict       dst_row = (uint8_t *) dst;
+
+    for (uint32_t rows_left = num_rows; rows_left > 0; rows_left--) {
+        hvx_tanh_f32_aa(dst_row, src_row, row_elems);
+        src_row += row_size;
+        dst_row += row_size;
+    }
+}
+
 static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
    const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
    struct htp_ops_context * octx = uctx->octx;
@@ -477,6 +492,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
            case HTP_OP_UNARY_SOFTPLUS:
                softplus_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                break;
+            case HTP_OP_UNARY_TANH:
+                tanh_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
+                break;
            case HTP_OP_L2_NORM:
                l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                break;
@@ -547,10 +565,12 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
        case HTP_OP_UNARY_SOFTPLUS:
            op_type = "softplus-f32";
            break;
+        case HTP_OP_UNARY_TANH:
+            op_type = "tanh-f32";
+            break;
        case HTP_OP_L2_NORM:
            op_type = "l2norm-f32";
            break;
-
        default:
            FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
            return HTP_STATUS_NO_SUPPORT;
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -647,19 +647,30 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_m
    return res;
 }

-ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, ggml_type tsrc0, ggml_type tsrc1, int nsg, int nxpsg, int r1ptg) {
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, const ggml_tensor * op, int nsg, int nxpsg, int r1ptg) {
    char base[256];
    char name[256];

+    const ggml_type tsrc0 = op->src[0]->type;
+    const ggml_type tsrc1 = op->src[1]->type;
+    const int       ne12  = op->src[1]->ne[2];
+    const int       r2    = ne12 / op->src[0]->ne[2];
+    const int       r3    = op->src[1]->ne[3] / op->src[0]->ne[3];
+
+    GGML_ASSERT(ne12 <= INT16_MAX && r2 <= INT16_MAX && r3 <= INT16_MAX);
+
    snprintf(base, 256, "kernel_mul_mv_ext_%s_%s_r1_%d", ggml_type_name(tsrc0), ggml_type_name(tsrc1), r1ptg);
-    snprintf(name, 256, "%s_nsg=%d_nxpsg=%d", base, nsg, nxpsg);
+    snprintf(name, 256, "%s_nsg=%d_nxpsg=%d_ne12=%d_r2=%d_r3=%d", base, nsg, nxpsg, ne12, r2, r3);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (!res.pipeline) {
        ggml_metal_cv_t cv = ggml_metal_cv_init();

-        ggml_metal_cv_set_int16(cv, nsg,   FC_MUL_MV + 0);
-        ggml_metal_cv_set_int16(cv, nxpsg, FC_MUL_MV + 1);
+        ggml_metal_cv_set_int16(cv, nsg,            FC_MUL_MV + 0);
+        ggml_metal_cv_set_int16(cv, nxpsg,          FC_MUL_MV + 1);
+        ggml_metal_cv_set_int16(cv, (int16_t) ne12, FC_MUL_MV + 2);
+        ggml_metal_cv_set_int16(cv, (int16_t) r2,   FC_MUL_MV + 3);
+        ggml_metal_cv_set_int16(cv, (int16_t) r3,   FC_MUL_MV + 4);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

@@ -687,8 +698,15 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
        ? (op->ne[0] % NRA != 0 || op->ne[1] % NRB != 0)
        : (op->ne[0] % 64  != 0 || op->ne[1] % 32  != 0);

+    GGML_ASSERT(op->src[1]->ne[2] <= INT16_MAX && op->src[1]->ne[3] <= INT16_MAX);
+    const int16_t ne12 = (int16_t) op->src[1]->ne[2];
+    const int16_t ne13 = (int16_t) op->src[1]->ne[3];
+    const int16_t r2   = (int16_t) (ne12 / op->src[0]->ne[2]);
+    const int16_t r3   = (int16_t) (ne13 / op->src[0]->ne[3]);
+
    snprintf(base, 256, "kernel_mul_mm_%s_%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1));
-    snprintf(name, 256, "%s_bci=%d_bco=%d", base, bc_inp, bc_out);
+    snprintf(name, 256, "%s_bci=%d_bco=%d_ne12=%d_ne13=%d_r2=%d_r3=%d",
+             base, bc_inp, bc_out, ne12, ne13, r2, r3);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (!res.pipeline) {
@@ -696,6 +714,10 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta

        ggml_metal_cv_set_bool(cv, bc_inp, FC_MUL_MM + 0);
        ggml_metal_cv_set_bool(cv, bc_out, FC_MUL_MM + 1);
+        ggml_metal_cv_set_int16(cv, ne12,  FC_MUL_MM + 2);
+        ggml_metal_cv_set_int16(cv, ne13,  FC_MUL_MM + 3);
+        ggml_metal_cv_set_int16(cv, r2,    FC_MUL_MM + 4);
+        ggml_metal_cv_set_int16(cv, r3,    FC_MUL_MM + 5);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

@@ -877,14 +899,21 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
            }
    };

+    GGML_ASSERT(ne12 <= INT16_MAX && ne13 <= INT16_MAX);
+    const int16_t r2 = (int16_t) (ne12 / ne02);
+    const int16_t r3 = (int16_t) (ne13 / ne03);
+
    snprintf(base, 256, "kernel_mul_mv_%s_%s%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1), suffix);
-    snprintf(name, 256, "%s_nsg=%d", base, nsg);
+    snprintf(name, 256, "%s_nsg=%d_ne12=%d_r2=%d_r3=%d", base, nsg, ne12, r2, r3);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (!res.pipeline) {
        ggml_metal_cv_t cv = ggml_metal_cv_init();

-        ggml_metal_cv_set_int16(cv, nsg, FC_MUL_MV + 0);
+        ggml_metal_cv_set_int16(cv, nsg,            FC_MUL_MV + 0);
+        ggml_metal_cv_set_int16(cv, (int16_t) ne12, FC_MUL_MV + 2);
+        ggml_metal_cv_set_int16(cv, r2,             FC_MUL_MV + 3);
+        ggml_metal_cv_set_int16(cv, r3,             FC_MUL_MV + 4);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

@@ -1102,6 +1131,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id(ggml_m
        ggml_metal_cv_t cv = ggml_metal_cv_init();

        ggml_metal_cv_set_int16(cv, nsg, FC_MUL_MV + 0);
+        ggml_metal_cv_set_int16(cv, 1,   FC_MUL_MV + 2);
+        ggml_metal_cv_set_int16(cv, 1,   FC_MUL_MV + 3);
+        ggml_metal_cv_set_int16(cv, 1,   FC_MUL_MV + 4);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -129,7 +129,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net   (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri         (ggml_metal_library_t lib, const struct ggml_tensor * op);
-struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, const struct ggml_tensor * op, int nsg, int nxpsg, int r1ptg);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm            (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv            (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0    (ggml_metal_library_t lib, int ne02, int ne20);
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -672,7 +672,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
                ![[dev->mtl_device name] containsString:@"M6"] &&
                ![[dev->mtl_device name] containsString:@"A19"] &&
                ![[dev->mtl_device name] containsString:@"A20"]) {
-                GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
+                GGML_LOG_INFO("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
                dev->props.has_tensor = false;
            }

--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2120,7 +2120,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
                GGML_ABORT("unsupported ne11");
        };

-        auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
+        auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op, nsg, nxpsg, r1ptg);

        ggml_metal_kargs_mul_mv_ext args = {
            /*.ne00  =*/ ne00,
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -3353,6 +3353,9 @@ static inline void helper_mv_reduce_and_write(

 constant short FC_mul_mv_nsg   [[function_constant(FC_MUL_MV + 0)]];
 constant short FC_mul_mv_nxpsg [[function_constant(FC_MUL_MV + 1)]];
+constant short FC_mul_mv_ne12  [[function_constant(FC_MUL_MV + 2)]];
+constant short FC_mul_mv_r2    [[function_constant(FC_MUL_MV + 3)]];
+constant short FC_mul_mv_r3    [[function_constant(FC_MUL_MV + 4)]];

 template<typename block_q_type, short NR0, typename args_t>
 void mul_vec_q_n_f32_impl(
@@ -3376,10 +3379,10 @@ void mul_vec_q_n_f32_impl(
    const int r1 =  tgpig.y;
    const int im =  tgpig.z;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-  //const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+  //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

  //device const block_q_type * x = (device const block_q_type *) (src0 + offset0);
@@ -3388,7 +3391,7 @@ void mul_vec_q_n_f32_impl(
    // pointers to src0 rows
    device const block_q_type * ax[NR0];
    FOR_UNROLL (int row = 0; row < NR0; ++row) {
-        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;

        ax[row] = (device const block_q_type *) ((device char *) src0 + offset0);
    }
@@ -3462,8 +3465,8 @@ void kernel_mul_mv_q1_0_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

    const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13;

@@ -3471,7 +3474,7 @@ void kernel_mul_mv_q1_0_f32_impl(

    device const block_q1_0 * ax[nr0];
    for (int row = 0; row < nr0; ++row) {
-        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
        ax[row] = (device const block_q1_0 *) ((device char *) src0 + offset0);
    }

@@ -3590,10 +3593,10 @@ void kernel_mul_mv_q8_0_f32_impl(
    const int r1 = tgpig.y;
    const int im = tgpig.z;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-  //const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+  //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

  //device const block_q8_0 * x = (device const block_q8_0 *) (src0 + offset0);
@@ -3602,7 +3605,7 @@ void kernel_mul_mv_q8_0_f32_impl(
    // pointers to src0 rows
    device const block_q8_0 * ax[NR0];
    FOR_UNROLL (short row = 0; row < NR0; ++row) {
-        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;

        ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0);
    }
@@ -3682,10 +3685,10 @@ void kernel_mul_mv_ext_q4_f32_impl(
    const int i11 = tgpig.y*r1ptg;
    const int i1m = tgpig.z;

-    const int i12 = i1m%args.ne12;
-    const int i13 = i1m/args.ne12;
+    const int i12 = i1m%FC_mul_mv_ne12;
+    const int i13 = i1m/FC_mul_mv_ne12;

-    const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 = i11*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
@@ -3785,10 +3788,10 @@ void kernel_mul_mv_ext_q4x4_f32_impl(
    const int i11 = tgpig.y*r1ptg;
    const int i1m = tgpig.z;

-    const int i12 = i1m%args.ne12;
-    const int i13 = i1m/args.ne12;
+    const int i12 = i1m%FC_mul_mv_ne12;
+    const int i13 = i1m/FC_mul_mv_ne12;

-    const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = i01*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 = i11*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
@@ -4000,10 +4003,10 @@ void kernel_mul_mv_t_t_impl(
    const int r1 = tgpig.y;
    const int im = tgpig.z;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-  //const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+  //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

  //device const T0 * x = (device const T0 *) (src0 + offset0);
@@ -4012,7 +4015,7 @@ void kernel_mul_mv_t_t_impl(
    // pointers to src0 rows
    device const T0 * ax [NR0];
    FOR_UNROLL (short row = 0; row < NR0; ++row) {
-        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;

        ax[row] = (device const T0 *) ((device char *) src0 + offset0);
    }
@@ -4122,10 +4125,10 @@ void kernel_mul_mv_t_t_4_impl(
    const int r1 = tgpig.y;
    const int im = tgpig.z;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-  //const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+  //const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const T1  * y  = (device const T1  *) (src1 + offset1);
@@ -4135,7 +4138,7 @@ void kernel_mul_mv_t_t_4_impl(
    device const T0  * ax [NR0];
    device const T04 * ax4[NR0];
    FOR_UNROLL (short row = 0; row < NR0; ++row) {
-        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+        const uint64_t offset0 = (r0 + row)*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;

        ax [row] = (device const T0  *) ((device char *) src0 + offset0);
        ax4[row] = (device const T04 *) ((device char *) src0 + offset0);
@@ -4239,10 +4242,10 @@ void kernel_mul_mv_t_t_short_impl(
        return;
    }

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = r0*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;

    device const T0 * x = (device const T0 *) (src0 + offset0);

@@ -7462,10 +7465,10 @@ void kernel_mul_mv_q2_K_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_q2_K * x = (device const block_q2_K *) (src0 + offset0);
@@ -7567,10 +7570,10 @@ void kernel_mul_mv_q3_K_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_q3_K * x = (device const block_q3_K *) (src0 + offset0);
@@ -7741,10 +7744,10 @@ void kernel_mul_mv_q4_K_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_q4_K * x = (device const block_q4_K *) (src0 + offset0);
@@ -7853,10 +7856,10 @@ void kernel_mul_mv_q5_K_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_q5_K * x = (device const block_q5_K *) (src0 + offset0);
@@ -7989,10 +7992,10 @@ void kernel_mul_mv_q6_K_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_q6_K * x = (device const block_q6_K *) (src0 + offset0);
@@ -8094,10 +8097,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq2_xxs * x = (device const block_iq2_xxs *) (src0 + offset0);
@@ -8202,10 +8205,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq2_xs * x = (device const block_iq2_xs *) (src0 + offset0);
@@ -8321,10 +8324,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq3_xxs * x = (device const block_iq3_xxs *) (src0 + offset0);
@@ -8433,10 +8436,10 @@ void kernel_mul_mv_iq3_s_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq3_s * x = (device const block_iq3_s *) (src0 + offset0);
@@ -8545,10 +8548,10 @@ void kernel_mul_mv_iq2_s_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq2_s * x = (device const block_iq2_s *) (src0 + offset0);
@@ -8658,10 +8661,10 @@ void kernel_mul_mv_iq1_s_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq1_s * x = (device const block_iq1_s *) (src0 + offset0);
@@ -8757,10 +8760,10 @@ void kernel_mul_mv_iq1_m_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * nr0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq1_m * x = (device const block_iq1_m *) (src0 + offset0);
@@ -8866,10 +8869,10 @@ void kernel_mul_mv_iq4_nl_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * NR0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0);
@@ -8975,10 +8978,10 @@ void kernel_mul_mv_iq4_xs_f32_impl(
    const int im = tgpig.z;
    const int first_row = (r0 * NSG + sgitg) * NR0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0);
@@ -9086,10 +9089,10 @@ void kernel_mul_mv_mxfp4_f32_impl(

    const int first_row = (r0 * NSG + sgitg) * NR0;

-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
+    const uint i12 = im%FC_mul_mv_ne12;
+    const uint i13 = im/FC_mul_mv_ne12;

-    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = first_row*args.nb01 + (i12/FC_mul_mv_r2)*args.nb02 + (i13/FC_mul_mv_r3)*args.nb03;
    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_mxfp4 * x = (device const block_mxfp4 *) (src0 + offset0);
@@ -9304,6 +9307,10 @@ kernel void kernel_diag_f32(

 constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
 constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
+constant short FC_mul_mm_ne12  [[function_constant(FC_MUL_MM + 2)]]; // used by kernel_mul_mm instead of args.ne12 to split tgpig.z into i12 / i13
+constant short FC_mul_mm_ne13  [[function_constant(FC_MUL_MM + 3)]]; // NOTE(review): no use of this constant is visible in the kernel_mul_mm hunks - confirm it is needed
+constant short FC_mul_mm_r2    [[function_constant(FC_MUL_MM + 4)]]; // used instead of args.r2: divisor applied to i12 when computing the src0 batch offset
+constant short FC_mul_mm_r3    [[function_constant(FC_MUL_MM + 5)]]; // used instead of args.r3: divisor applied to i13 when computing the src0 batch offset

 // each block_q contains 16*nl weights
 #ifdef GGML_METAL_HAS_TENSOR
@@ -9330,11 +9337,11 @@ kernel void kernel_mul_mm(

    // Batch dimension handling
    const int im = tgpig.z;
-    const int i12 = im % args.ne12;
-    const int i13 = im / args.ne12;
+    const int i12 = im % FC_mul_mm_ne12;
+    const int i13 = im / FC_mul_mm_ne12;

    // Batch offsets for srcA and srcB
-    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03;

    // Tile dimensions
    constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
@@ -9473,10 +9480,10 @@ kernel void kernel_mul_mm(

    short il = il0;

-    const int i12 = im%args.ne12;
-    const int i13 = im/args.ne12;
+    const int i12 = im % FC_mul_mm_ne12;
+    const int i13 = im / FC_mul_mm_ne12;

-    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03;
    const short    offset1 = il0/nl;

    device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -104,6 +104,12 @@ set(GGML_OPENCL_KERNELS
    mul_mv_id_mxfp4_f32_flat
    gemm_moe_q4_0_f32_ns
    gemv_moe_q4_0_f32_ns
+    gemm_moe_q4_1_f32_ns
+    gemv_moe_q4_1_f32_ns
+    gemm_moe_q5_0_f32_ns
+    gemv_moe_q5_0_f32_ns
+    gemm_moe_q5_1_f32_ns
+    gemv_moe_q5_1_f32_ns
    gemm_moe_mxfp4_f32
    gemv_moe_mxfp4_f32
    gemm_moe_mxfp4_f32_ns
@@ -174,6 +180,10 @@ set(GGML_OPENCL_KERNELS
    flash_attn_f32
 )

+if (GGML_OPENCL_USE_ADRENO_KERNELS) # only register this kernel when the Adreno kernels option is enabled
+    list(APPEND GGML_OPENCL_KERNELS gemm_xmem_f16_f32_os8) # must happen before the foreach below consumes the list
+endif ()
+
 foreach (K ${GGML_OPENCL_KERNELS})
    ggml_opencl_add_kernel(${K})
 endforeach()
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
--- a/ggml/src/ggml-opencl/kernels/cvt.cl
+++ b/ggml/src/ggml-opencl/kernels/cvt.cl
@@ -56,6 +56,25 @@ struct block_q4_1 {
    uchar qs[QK4_1 / 2]; // nibbles / quants
 };

+//------------------------------------------------------------------------------
+// block_q5_0
+//------------------------------------------------------------------------------
+struct block_q5_0 {
+    half d; // delta (multiplicative scale of the block)
+    uchar qh[4]; // 5-th bit of quants (32 bits in total; copied as one uint by the trans4_ns kernels)
+    uchar qs[QK5_0 / 2]; // nibbles / quants (low 4 bits, two quants per byte)
+};
+
+//------------------------------------------------------------------------------
+// block_q5_1
+//------------------------------------------------------------------------------
+struct block_q5_1 {
+    half d; // delta (multiplicative scale of the block)
+    half m; // min (additive offset of the block)
+    uchar qh[4]; // 5-th bit of quants (32 bits in total; copied as one uint by the trans4_ns kernels)
+    uchar qs[QK5_1 / 2]; // nibbles / quants (low 4 bits, two quants per byte)
+};
+
 //------------------------------------------------------------------------------
 // block_q4_k
 //------------------------------------------------------------------------------
@@ -370,6 +389,281 @@ kernel void kernel_restore_block_q4_1_noshuffle(
    }
 }

+kernel void kernel_convert_block_q4_1_trans4_ns( // split q4_1 blocks into d / m / transposed quant arrays
+    __global struct block_q4_1 * src0, // in:  ne00 / QK4_1 blocks per row, ne01 rows per slice
+    __global uint * dst_q,             // out: 4 uints of nibbles per block, stored as 4 planes strided by ne01
+    __global half * dst_d,             // out: block scales, indexed [slice][block][row]
+    __global half * dst_m,             // out: block mins, same indexing as dst_d
+    uint ne00,                         // row length; ne00 / QK4_1 blocks per row
+    uint ne01                          // number of rows per slice
+) {
+    uint i00 = get_global_id(1); // block index within the row
+    uint i01 = get_global_id(0); // row index
+    uint i02 = get_global_id(2); // slice index
+    // NOTE: no bounds checks; the launch grid is assumed to match the tensor shape exactly
+    uint ne00_blk = ne00 / QK4_1;
+    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; // row-major source
+    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;     // block-major (transposed) destination
+
+    global struct block_q4_1 * b = src0 + src_blk_offset;
+    dst_d[dst_blk_offset] = b->d;
+    dst_m[dst_blk_offset] = b->m;
+
+    // extract quantization and unshuffle
+    // vload8 only needs ushort alignment; dereferencing a ushort8 pointer would assert 16-byte alignment
+    ushort8 pre_block = vload8(0, (global ushort *)(&(b->qs[0])));
+    ushort8 post_block = (ushort8)(0);
+
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK4_1 / 4; ++i) {
+        uchar x0 = pre_block_ptr[2*i + 0];
+        uchar x1 = pre_block_ptr[2*i + 1];
+
+        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // low nibbles of bytes 2i, 2i+1
+        post_block_ptr[i + QK4_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // high nibbles of bytes 2i, 2i+1
+    }
+
+    uint4 q_block = as_uint4(post_block);
+
+    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01; // first of the 4 planes for this block
+    dst_q[offset] = q_block.x;
+    dst_q[offset + ne01] = q_block.y;
+    dst_q[offset + ne01 * 2] = q_block.z;
+    dst_q[offset + ne01 * 3] = q_block.w;
+}
+
+kernel void kernel_restore_block_q4_1_trans4_ns( // inverse of kernel_convert_block_q4_1_trans4_ns
+    __global uint * src_q,              // in:  4 uints of nibbles per block, stored as 4 planes strided by ne01
+    __global half * src_d,              // in:  block scales, indexed [slice][block][row]
+    __global half * src_m,              // in:  block mins, same indexing as src_d
+    __global struct block_q4_1 * dst0,  // out: ne00 / QK4_1 blocks per row, ne01 rows per slice
+    uint ne00,                          // row length; ne00 / QK4_1 blocks per row
+    uint ne01                           // number of rows per slice
+) {
+    int i00 = get_global_id(1);  // block index within the row
+    uint i01 = get_global_id(0); // row index
+    uint i02 = get_global_id(2); // slice index
+    // NOTE: no bounds checks; the launch grid is assumed to match the tensor shape exactly
+    uint ne00_blk = ne00 / QK4_1;
+    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; // row-major destination
+    uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;      // block-major (transposed) source
+
+    __global struct block_q4_1 * b = dst0 + dst_blk_offset;
+    b->d = src_d[src_dm_offset];
+    b->m = src_m[src_dm_offset];
+
+    // collect transposed quantization parts for a block
+    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
+    uint4 q_block;
+    q_block.x = src_q[src_q_offset];
+    q_block.y = src_q[src_q_offset + ne01];
+    q_block.z = src_q[src_q_offset + ne01 * 2];
+    q_block.w = src_q[src_q_offset + ne01 * 3];
+
+    ushort8 post_block = as_ushort8(q_block);
+    ushort8 pre_block = (ushort8)(0);
+
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK4_1 / 4; ++i) { // QK4_1 (not QK4_0): this kernel handles q4_1 blocks
+        uchar x0 = post_block_ptr[i + 0];
+        uchar x1 = post_block_ptr[i + QK4_1 / 4];
+
+        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // re-interleave low nibbles
+        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // re-interleave high nibbles
+    }
+    // vstore8 only needs ushort alignment; storing through a ushort8 pointer would assert 16-byte alignment
+    vstore8(pre_block, 0, (__global ushort *)(&(b->qs[0])));
+}
+
+kernel void kernel_convert_block_q5_0_trans4_ns( // split q5_0 blocks into d / qh / transposed qs arrays
+    __global struct block_q5_0 * src0, // in:  ne00 / QK5_0 blocks per row, ne01 rows per slice
+    __global uint * dst_qs,            // out: 4 uints of nibbles per block, stored as 4 planes strided by ne01
+    __global uint * dst_qh,            // out: the 4 qh bytes of each block as one uint, indexed like dst_d
+    __global half * dst_d,             // out: block scales, indexed [slice][block][row]
+    uint ne00,                         // row length; ne00 / QK5_0 blocks per row
+    uint ne01                          // number of rows per slice
+) {
+    uint i00 = get_global_id(1); // block index within the row
+    uint i01 = get_global_id(0); // row index
+    uint i02 = get_global_id(2); // slice index
+    // NOTE: no bounds checks; the launch grid is assumed to match the tensor shape exactly
+    uint ne00_blk = ne00 / QK5_0;
+    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; // row-major source
+    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;     // block-major (transposed) destination
+
+    global struct block_q5_0 * b = src0 + src_blk_offset;
+    dst_d[dst_blk_offset] = b->d;
+    // qh follows a single half, so it is only 2-byte aligned: read it bytewise instead of through a uint pointer
+    dst_qh[dst_blk_offset] = as_uint(vload4(0, &(b->qh[0])));
+
+    // extract quantization and unshuffle (vload8 only needs ushort alignment, unlike a ushort8 dereference)
+    ushort8 pre_block = vload8(0, (global ushort *)(&(b->qs[0])));
+    ushort8 post_block = (ushort8)(0);
+
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK5_0 / 4; ++i) {
+        uchar x0 = pre_block_ptr[2*i + 0];
+        uchar x1 = pre_block_ptr[2*i + 1];
+
+        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // low nibbles of bytes 2i, 2i+1
+        post_block_ptr[i + QK5_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // high nibbles of bytes 2i, 2i+1
+    }
+
+    uint4 q_block = as_uint4(post_block);
+
+    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01; // first of the 4 planes for this block
+    dst_qs[offset] = q_block.x;
+    dst_qs[offset + ne01] = q_block.y;
+    dst_qs[offset + ne01 * 2] = q_block.z;
+    dst_qs[offset + ne01 * 3] = q_block.w;
+}
+
+kernel void kernel_restore_block_q5_0_trans4_ns( // inverse of kernel_convert_block_q5_0_trans4_ns
+    __global uint * src_qs,             // in:  4 uints of nibbles per block, stored as 4 planes strided by ne01
+    __global uint * src_qh,             // in:  the 4 qh bytes of each block as one uint, indexed like src_d
+    __global half * src_d,              // in:  block scales, indexed [slice][block][row]
+    __global struct block_q5_0 * dst0,  // out: ne00 / QK5_0 blocks per row, ne01 rows per slice
+    uint ne00,                          // row length; ne00 / QK5_0 blocks per row
+    uint ne01                           // number of rows per slice
+) {
+    int i00 = get_global_id(1);  // block index within the row
+    uint i01 = get_global_id(0); // row index
+    uint i02 = get_global_id(2); // slice index
+    // NOTE: no bounds checks; the launch grid is assumed to match the tensor shape exactly
+    uint ne00_blk = ne00 / QK5_0;
+    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; // row-major destination
+    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;     // block-major (transposed) source
+
+    __global struct block_q5_0 * b = dst0 + dst_blk_offset;
+    b->d = src_d[src_blk_offset];
+    // qh follows a single half, so it is only 2-byte aligned: write it bytewise instead of through a uint pointer
+    vstore4(as_uchar4(src_qh[src_blk_offset]), 0, &(b->qh[0]));
+
+    // collect transposed quantization parts for a block
+    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
+    uint4 q_block;
+    q_block.x = src_qs[src_q_offset];
+    q_block.y = src_qs[src_q_offset + ne01];
+    q_block.z = src_qs[src_q_offset + ne01 * 2];
+    q_block.w = src_qs[src_q_offset + ne01 * 3];
+
+    ushort8 post_block = as_ushort8(q_block);
+    ushort8 pre_block = (ushort8)(0);
+
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK5_0 / 4; ++i) {
+        uchar x0 = post_block_ptr[i + 0];
+        uchar x1 = post_block_ptr[i + QK5_0 / 4];
+
+        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // re-interleave low nibbles
+        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // re-interleave high nibbles
+    }
+    // vstore8 only needs ushort alignment; storing through a ushort8 pointer would assert 16-byte alignment
+    vstore8(pre_block, 0, (__global ushort *)(&(b->qs[0])));
+}
+
+kernel void kernel_convert_block_q5_1_trans4_ns( // split q5_1 blocks into d / m / qh / transposed qs arrays
+    __global struct block_q5_1 * src0, // in:  ne00 / QK5_1 blocks per row, ne01 rows per slice
+    __global uint * dst_qs,            // out: 4 uints of nibbles per block, stored as 4 planes strided by ne01
+    __global uint * dst_qh,            // out: the 4 qh bytes of each block as one uint, indexed like dst_d
+    __global half * dst_d,             // out: block scales, indexed [slice][block][row]
+    __global half * dst_m,             // out: block mins, same indexing as dst_d
+    uint ne00,                         // row length; ne00 / QK5_1 blocks per row
+    uint ne01                          // number of rows per slice
+) {
+    uint i00 = get_global_id(1); // block index within the row
+    uint i01 = get_global_id(0); // row index
+    uint i02 = get_global_id(2); // slice index
+    // NOTE: no bounds checks; the launch grid is assumed to match the tensor shape exactly
+    uint ne00_blk = ne00 / QK5_1;
+    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; // row-major source
+    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;     // block-major (transposed) destination
+
+    global struct block_q5_1 * b = src0 + src_blk_offset;
+    dst_d[dst_blk_offset] = b->d;
+    dst_m[dst_blk_offset] = b->m;
+    // read qh bytewise so the copy does not depend on the block being uint-aligned
+    dst_qh[dst_blk_offset] = as_uint(vload4(0, &(b->qh[0])));
+
+    // extract quantization and unshuffle (vload8 only needs ushort alignment, unlike a ushort8 dereference)
+    ushort8 pre_block = vload8(0, (global ushort *)(&(b->qs[0])));
+    ushort8 post_block = (ushort8)(0);
+
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK5_1 / 4; ++i) {
+        uchar x0 = pre_block_ptr[2*i + 0];
+        uchar x1 = pre_block_ptr[2*i + 1];
+
+        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // low nibbles of bytes 2i, 2i+1
+        post_block_ptr[i + QK5_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // high nibbles of bytes 2i, 2i+1
+    }
+
+    uint4 q_block = as_uint4(post_block);
+
+    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01; // first of the 4 planes for this block
+    dst_qs[offset] = q_block.x;
+    dst_qs[offset + ne01] = q_block.y;
+    dst_qs[offset + ne01 * 2] = q_block.z;
+    dst_qs[offset + ne01 * 3] = q_block.w;
+}
+
+kernel void kernel_restore_block_q5_1_trans4_ns( // inverse of kernel_convert_block_q5_1_trans4_ns
+    __global uint * src_qs,             // in:  4 uints of nibbles per block, stored as 4 planes strided by ne01
+    __global uint * src_qh,             // in:  the 4 qh bytes of each block as one uint, indexed like src_d
+    __global half * src_d,              // in:  block scales, indexed [slice][block][row]
+    __global half * src_m,              // in:  block mins, same indexing as src_d
+    __global struct block_q5_1 * dst0,  // out: ne00 / QK5_1 blocks per row, ne01 rows per slice
+    uint ne00,                          // row length; ne00 / QK5_1 blocks per row
+    uint ne01                           // number of rows per slice
+) {
+    int i00 = get_global_id(1);  // block index within the row
+    uint i01 = get_global_id(0); // row index
+    uint i02 = get_global_id(2); // slice index
+    // NOTE: no bounds checks; the launch grid is assumed to match the tensor shape exactly
+    uint ne00_blk = ne00 / QK5_1;
+    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; // row-major destination
+    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;     // block-major (transposed) source
+
+    __global struct block_q5_1 * b = dst0 + dst_blk_offset;
+    b->d = src_d[src_blk_offset];
+    b->m = src_m[src_blk_offset];
+    // write qh bytewise so the copy does not depend on the block being uint-aligned
+    vstore4(as_uchar4(src_qh[src_blk_offset]), 0, &(b->qh[0]));
+
+    // collect transposed quantization parts for a block
+    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
+    uint4 q_block;
+    q_block.x = src_qs[src_q_offset];
+    q_block.y = src_qs[src_q_offset + ne01];
+    q_block.z = src_qs[src_q_offset + ne01 * 2];
+    q_block.w = src_qs[src_q_offset + ne01 * 3];
+
+    ushort8 post_block = as_ushort8(q_block);
+    ushort8 pre_block = (ushort8)(0);
+
+    uchar * pre_block_ptr = (uchar *)(&pre_block);
+    uchar * post_block_ptr = (uchar *)(&post_block);
+
+    for (int i = 0; i < QK5_1 / 4; ++i) {
+        uchar x0 = post_block_ptr[i + 0];
+        uchar x1 = post_block_ptr[i + QK5_1 / 4];
+
+        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // re-interleave low nibbles
+        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // re-interleave high nibbles
+    }
+    vstore8(pre_block, 0, (__global ushort *)(&(b->qs[0]))); // vstore8 needs only ushort alignment, unlike a ushort8 pointer store
+}
+
 //------------------------------------------------------------------------------
 // block_mxfp4
 //------------------------------------------------------------------------------
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
@@ -0,0 +1,254 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#define TILESIZE_K 16 // K elements per sub-block; the kernel's main loop advances by 2 * TILESIZE_K per iteration
+#define TILESIZE_M 64 // output rows per tile: row = get_global_id(1) * TILESIZE_M
+#define TILESIZE_N 32 // output columns per tile: col = get_global_id(2) * TILESIZE_N
+
+
+#define dequantize_q4_1(q4, a_f16, scale, m) /* a_f16.sN = (nibble N of q4, lowest nibble of q4.s0 first) * scale + m */ \
+    a_f16.s0 = (half)(q4.s0 & 0x000F) * scale + m; \
+    a_f16.s1 = (half)((q4.s0 & 0x00F0) >> 4) * scale + m; \
+    a_f16.s2 = (half)((q4.s0 & 0x0F00) >> 8) * scale + m; \
+    a_f16.s3 = (half)((q4.s0 & 0xF000) >> 12) * scale + m; \
+    a_f16.s4 = (half)(q4.s1 & 0x000F) * scale + m; \
+    a_f16.s5 = (half)((q4.s1 & 0x00F0) >> 4) * scale + m; \
+    a_f16.s6 = (half)((q4.s1 & 0x0F00) >> 8) * scale + m; \
+    a_f16.s7 = (half)((q4.s1 & 0xF000) >> 12) * scale + m; \
+    a_f16.s8 = (half)(q4.s2 & 0x000F) * scale + m; \
+    a_f16.s9 = (half)((q4.s2 & 0x00F0) >> 4) * scale + m; \
+    a_f16.sa = (half)((q4.s2 & 0x0F00) >> 8) * scale + m; \
+    a_f16.sb = (half)((q4.s2 & 0xF000) >> 12) * scale + m; \
+    a_f16.sc = (half)(q4.s3 & 0x000F) * scale + m; \
+    a_f16.sd = (half)((q4.s3 & 0x00F0) >> 4) * scale + m; \
+    a_f16.se = (half)((q4.s3 & 0x0F00) >> 8) * scale + m; \
+    a_f16.sf = (half)((q4.s3 & 0xF000) >> 12) * scale + m; \
+
+
+#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) /* 16 columns: c_reg += a_reg . b_lm, summed in half over 8 products at a time; uses a caller-declared half16 named acc */ \
+    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
+    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
+    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
+    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
+    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
+    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
+    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
+    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
+    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
+    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
+    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
+    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
+    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
+    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
+    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
+    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
+    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
+    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
+    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
+    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
+    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
+    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
+    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
+    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
+    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
+    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
+    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
+    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
+    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
+    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
+    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
+    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
+    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
+    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
+    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
+    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
+    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
+    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
+    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
+    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
+    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
+    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
+    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
+    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
+    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
+    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
+    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
+    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
+    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
+    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
+    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
+    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
+    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
+    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
+    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
+    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
+    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
+    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
+    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
+    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
+    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
+    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
+    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+
+
+__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
+kernel void kernel_gemm_moe_q4_1_f32_ns( // tiled q4_1 x f32 GEMM, one TILESIZE_M x TILESIZE_N output tile per work-group
+        __read_only  image1d_buffer_t src0_q,    // q4_1 nibbles, one uint (8 quants) per texel read, in the transposed layout
+        __global     half *           src0_d,    // per-block scales
+        __global     half *           src0_m,    // per-block mins
+        __read_only  image1d_buffer_t src1,      // matrix B, read as float4 texels
+        __global     uint *           src2,      // TILESIZE_N output indices per n-tile; 0xFFFFFFFF marks a padding slot
+        __global     ushort *         src2_emap, // expert id of each n-tile
+        __write_only image1d_buffer_t dst,       // output, addressed as idx * ne01 + row
+        __global     int *            total_tiles, // total_tiles[0] = number of valid n-tiles
+        uint ne00, // K dimension
+        uint ne01  // number of rows of src0 per expert
+) {
+    uint block_id_m = get_global_id(1); // m_tile
+    uint block_id_n = get_global_id(2); // n_tile
+
+    // Boundary check. NOTE(review): work-items leaving here skip the barriers below - presumably ne01 is a multiple of TILESIZE_M; confirm with the host code
+    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+        return;
+    }
+
+    __private half16 reg_a; // dequantized weights: 16 K-values of this work-item's row
+    __private float32 reg_c = (float32)(0); // fp32 accumulators: one per column of the tile
+    __local half4 shared_b[128]; // B sub-block: index = (K group of 4) * 32 + column
+
+    const ushort expert_id = src2_emap[block_id_n];
+
+    const uint row = block_id_m * TILESIZE_M;
+    const uint col = block_id_n * TILESIZE_N;
+
+    uint sub_block_id_m = get_local_id(0);
+    uint2 b_global_offset; // element offsets into src1 for the two float4 loads of this work-item
+    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
+    b_global_offset.y = b_global_offset.x + (16 * ne00);
+    uint2 b_local_offset; // matching slots in shared_b
+    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
+    b_local_offset.y = b_local_offset.x + 16;
+
+    // Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
+    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
+        // First sub-block
+        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); // in uints (8 quants each)
+        uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5); // in blocks (32 quants each)
+        uint b_sub_offset = col * ne00 + step;
+
+        // Load scale and m for current Q4_1 block
+        uint sm_offset = s_sub_offset + get_global_id(0);
+        half s = src0_d[sm_offset];
+        half m = src0_m[sm_offset];
+
+        // Load 16 q (64-bits) in transposed layout
+        uint2 q4x16;
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        float8 bx8_f32;
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        half8 bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization
+        dequantize_q4_1(as_ushort4(q4x16), reg_a, s, m);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // fp16 dot products for 32 columns (2 x 16), reduced over 8 elements in half before the fp32 accumulation
+        half16 acc;
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+
+        // Repeat for second sub-block
+        uint half_step = step + TILESIZE_K;
+        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        b_sub_offset = col * ne00 + half_step;
+
+        // Load next 16 q (64-bits) in transposed layout
+        q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x;
+        q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization (s and m are reused: both sub-blocks belong to the same Q4_1 block)
+        dequantize_q4_1(as_ushort4(q4x16), reg_a, s, m);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // fp16 dot products for 32 columns (2 x 16), reduced over 8 elements in half before the fp32 accumulation
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+    }
+
+    // Load the output indices of this tile from src2 and share them in LM
+    __local uint out_idx[TILESIZE_N];
+
+    if (get_local_id(0) < TILESIZE_N) {
+        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
+        if (idx == 0xFFFFFFFF) { // padding slot: redirect it to the tile's first output index
+            idx = src2[block_id_n * TILESIZE_N + 0];
+        }
+        out_idx[get_local_id(0)] = idx * ne01;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scatter results back to original position in output grid
+    uint m_offset = row + get_local_id(0);
+
+    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
+    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
+    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
+    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
+    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
+    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
+    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
+    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
+    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
+    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
+    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
+    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
+    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
+    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
+    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
+    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
+    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
+    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
+    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
+    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
+    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
+    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
+    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
+    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
+    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
+    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
+    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
+    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
+    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
+    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
+    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
+
+    // Padding slots above were written to the first output's position; write the real first result last so it overrides them
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
+}
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
@@ -0,0 +1,256 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#define TILESIZE_K 16 // K elements per sub-block; the kernel's main loop advances by 2 * TILESIZE_K
+#define TILESIZE_M 64 // rows per m tile (row = block_id_m * TILESIZE_M)
+#define TILESIZE_N 32 // columns per n tile (col = block_id_n * TILESIZE_N); also the size of out_idx
+// Expands to 16 statements that fill a_f16.s0..sf from one half-block: element i combines nibble i of qs5x16 (4 ushorts)
+// with bit i of qh5x16 (2 bytes) as a fifth bit, subtracts 16, converts to half and multiplies by scale.
+#define dequantize_q5_0(qs5x16, qh5x16, a_f16, scale) \
+    a_f16.s0 = (half)((( qs5x16.s0 & 0x000F)        | (( qh5x16.s0       & 0x01) << 4)) - 16) * scale; \
+    a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) - 16) * scale; \
+    a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) - 16) * scale; \
+    a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) - 16) * scale; \
+    a_f16.s4 = (half)((( qs5x16.s1 & 0x000F)        | (((qh5x16.s0 >> 4) & 0x01) << 4)) - 16) * scale; \
+    a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) - 16) * scale; \
+    a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) - 16) * scale; \
+    a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) - 16) * scale; \
+    a_f16.s8 = (half)((( qs5x16.s2 & 0x000F)        | (( qh5x16.s1       & 0x01) << 4)) - 16) * scale; \
+    a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) - 16) * scale; \
+    a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) - 16) * scale; \
+    a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) - 16) * scale; \
+    a_f16.sc = (half)((( qs5x16.s3 & 0x000F)        | (((qh5x16.s1 >> 4) & 0x01) << 4)) - 16) * scale; \
+    a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) - 16) * scale; \
+    a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) - 16) * scale; \
+    a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) - 16) * scale; \
+
+// Adds 16 dot products to c_reg: column j uses b_lm[lm_offset + 32*g + j] for K group g (4 halves of a_reg); groups 0-1 and 2-3 are each summed in the caller's half16 "acc" before the float add.
+#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
+    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
+    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
+    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
+    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
+    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
+    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
+    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
+    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
+    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
+    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
+    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
+    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
+    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
+    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
+    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
+    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
+    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
+    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
+    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
+    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
+    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
+    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
+    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
+    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
+    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
+    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
+    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
+    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
+    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
+    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
+    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
+    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
+    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
+    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
+    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
+    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
+    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
+    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
+    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
+    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
+    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
+    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
+    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
+    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
+    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
+    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
+    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
+    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
+    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
+    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
+    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
+    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
+    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
+    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
+    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
+    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
+    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
+    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
+    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
+    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
+    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
+    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
+    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+
+// MoE GEMM, Q5_0 weights x F32 activations: each work-item accumulates one row of a TILESIZE_N-column tile for the tile's expert (src2_emap), then scatters the 32 results to dst via the indices in src2.
+__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
+kernel void kernel_gemm_moe_q5_0_f32_ns(
+        __read_only  image1d_buffer_t src0_qs, // packed 4-bit parts of the weights (read_imageui, .x used)
+        __global     uint *           src0_qh, // one uint per block: the 32 fifth bits
+        __global     half *           src0_d, // one scale per block
+        __read_only  image1d_buffer_t src1, // matrix B, fetched four floats per texel
+        __global     uint *           src2, // per tile column: output index, 0xFFFFFFFF marks padding
+        __global     ushort *         src2_emap, // expert id of each n tile
+        __write_only image1d_buffer_t dst, // output, written with write_imagef
+        __global     int *            total_tiles, // total_tiles[0] bounds the valid n tiles
+        uint ne00, // K length (bound of the main loop)
+        uint ne01 // row count (bound of the row check, stride between K groups)
+) {
+    uint block_id_m = get_global_id(1); // m_tile
+    uint block_id_n = get_global_id(2); // n_tile
+
+    // Boundary check: drop work-items whose row is >= ne01 or whose n tile is >= total_tiles[0]
+    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+        return;
+    }
+
+    __private half16 reg_a; // 16 dequantized weights of the current sub-block
+    __private float32 reg_c = (float32)(0); // fp32 accumulators, one per tile column
+    __local half4 shared_b[128]; // B sub-block as half4: index = K group * 32 + column
+
+    const ushort expert_id = src2_emap[block_id_n];
+
+    const uint row = block_id_m * TILESIZE_M;
+    const uint col = block_id_n * TILESIZE_N;
+
+    uint sub_block_id_m = get_local_id(0);
+    uint2 b_global_offset;
+    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; // K offset 0/4/8/12 within column (id >> 2)
+    b_global_offset.y = b_global_offset.x + (16 * ne00); // same K offset, 16 columns further
+    uint2 b_local_offset;
+    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); // K group * 32 + column
+    b_local_offset.y = b_local_offset.x + 16;
+
+    // Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
+    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
+        // First sub-block
+        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
+        uint b_sub_offset = col * ne00 + step;
+
+        // Load scale for current Q5_0 block
+        uint blk_offset = s_sub_offset + get_global_id(0);
+        half s = src0_d[blk_offset];
+
+        // Load 32 qh (5-th bit of each Q5) for the entire block
+        uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]);
+
+        // Load 16 qs (half block) in transposed layout: two uints, the second one ne01 texels after the first
+        uint2 qsx16;
+        qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
+        qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        float8 bx8_f32;
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        half8 bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization: first half-block uses the low 16 qh bits
+        dequantize_q5_0(as_ushort4(qsx16), qhx32.lo, reg_a, s);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // 32 16x16 fp16 dot product with 8 elements reduction for better precision
+        half16 acc; // scratch written by dotx16_reduce8
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+
+        // Repeat for second sub-block
+        uint half_step = step + TILESIZE_K;
+        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        b_sub_offset = col * ne00 + half_step;
+
+        // Load next 16 qs in transposed layout
+        qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
+        qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization: second half-block uses the high 16 qh bits, same scale
+        dequantize_q5_0(as_ushort4(qsx16), qhx32.hi, reg_a, s);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // 32 16x16 fp16 dot product with 8 elements reduction for better precision (same as the first sub-block)
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+    }
+
+    // Load the output index of each tile column from src2 and share it in LM, pre-multiplied by ne01
+    __local uint out_idx[TILESIZE_N];
+
+    if (get_local_id(0) < TILESIZE_N) {
+        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
+        if (idx == 0xFFFFFFFF) { // padding column: redirect it to the tile's first output index
+            idx = src2[block_id_n * TILESIZE_N + 0];
+        }
+        out_idx[get_local_id(0)] = idx * ne01;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scatter results back to original position in output grid
+    uint m_offset = row + get_local_id(0);
+
+    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
+    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
+    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
+    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
+    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
+    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
+    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
+    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
+    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
+    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
+    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
+    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
+    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
+    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
+    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
+    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
+    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
+    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
+    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
+    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
+    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
+    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
+    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
+    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
+    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
+    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
+    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
+    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
+    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
+    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
+    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
+
+    // Padding columns were redirected to out_idx[0], so their writes above land on column 0's slot; column 0 is written last, after the barrier, so that its real result is intended to override them
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
+}
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
@@ -0,0 +1,258 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#define TILESIZE_K 16 // K elements per sub-block; the kernel's main loop advances by 2 * TILESIZE_K
+#define TILESIZE_M 64 // rows per m tile (row = block_id_m * TILESIZE_M)
+#define TILESIZE_N 32 // columns per n tile (col = block_id_n * TILESIZE_N); also the size of out_idx
+// Expands to 16 statements that fill a_f16.s0..sf from one half-block: element i combines nibble i of qs5x16 (4 ushorts)
+// with bit i of qh5x16 (2 bytes) as a fifth bit, then applies value * scale + m and converts to half.
+#define dequantize_q5_1(qs5x16, qh5x16, a_f16, scale, m) \
+    a_f16.s0 = (half)((( qs5x16.s0 & 0x000F)        | (( qh5x16.s0       & 0x01) << 4)) * scale + m); \
+    a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m); \
+    a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m); \
+    a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m); \
+    a_f16.s4 = (half)((( qs5x16.s1 & 0x000F)        | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m); \
+    a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m); \
+    a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \
+    a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m); \
+    a_f16.s8 = (half)((( qs5x16.s2 & 0x000F)        | (( qh5x16.s1       & 0x01) << 4)) * scale + m); \
+    a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m); \
+    a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m); \
+    a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m); \
+    a_f16.sc = (half)((( qs5x16.s3 & 0x000F)        | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m); \
+    a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m); \
+    a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m); \
+    a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m); \
+
+// Adds 16 dot products to c_reg: column j uses b_lm[lm_offset + 32*g + j] for K group g (4 halves of a_reg); groups 0-1 and 2-3 are each summed in the caller's half16 "acc" before the float add.
+#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
+    acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
+    acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
+    acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
+    acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
+    acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
+    acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
+    acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
+    acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
+    acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
+    acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
+    acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
+    acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
+    acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
+    acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
+    acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
+    acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
+    acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
+    acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
+    acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
+    acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
+    acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
+    acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
+    acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
+    acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
+    acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
+    acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
+    acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
+    acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
+    acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
+    acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
+    acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
+    acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+    acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
+    acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
+    acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
+    acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
+    acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
+    acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
+    acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
+    acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
+    acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
+    acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
+    acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
+    acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
+    acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
+    acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
+    acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
+    acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
+    acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
+    acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
+    acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
+    acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
+    acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
+    acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
+    acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
+    acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
+    acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
+    acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
+    acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
+    acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
+    acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
+    acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
+    acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
+    acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
+    c_reg.lo += convert_float8(acc.lo); \
+    c_reg.hi += convert_float8(acc.hi); \
+
+// MoE GEMM, Q5_1 weights x F32 activations: each work-item accumulates one row of a TILESIZE_N-column tile for the tile's expert (src2_emap), then scatters the 32 results to dst via the indices in src2.
+__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
+kernel void kernel_gemm_moe_q5_1_f32_ns(
+        __read_only  image1d_buffer_t src0_qs, // packed 4-bit parts of the weights (read_imageui, .x used)
+        __global     uint *           src0_qh, // one uint per block: the 32 fifth bits
+        __global     half *           src0_d, // one scale per block
+        __global     half *           src0_m, // one additive term (m) per block
+        __read_only  image1d_buffer_t src1, // matrix B, fetched four floats per texel
+        __global     uint *           src2, // per tile column: output index, 0xFFFFFFFF marks padding
+        __global     ushort *         src2_emap, // expert id of each n tile
+        __write_only image1d_buffer_t dst, // output, written with write_imagef
+        __global     int *            total_tiles, // total_tiles[0] bounds the valid n tiles
+        uint ne00, // K length (bound of the main loop)
+        uint ne01 // row count (bound of the row check, stride between K groups)
+) {
+    uint block_id_m = get_global_id(1); // m_tile
+    uint block_id_n = get_global_id(2); // n_tile
+
+    // Boundary check: drop work-items whose row is >= ne01 or whose n tile is >= total_tiles[0]
+    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+        return;
+    }
+
+    __private half16 reg_a; // 16 dequantized weights of the current sub-block
+    __private float32 reg_c = (float32)(0); // fp32 accumulators, one per tile column
+    __local half4 shared_b[128]; // B sub-block as half4: index = K group * 32 + column
+
+    const ushort expert_id = src2_emap[block_id_n];
+
+    const uint row = block_id_m * TILESIZE_M;
+    const uint col = block_id_n * TILESIZE_N;
+
+    uint sub_block_id_m = get_local_id(0);
+    uint2 b_global_offset;
+    b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; // K offset 0/4/8/12 within column (id >> 2)
+    b_global_offset.y = b_global_offset.x + (16 * ne00); // same K offset, 16 columns further
+    uint2 b_local_offset;
+    b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); // K group * 32 + column
+    b_local_offset.y = b_local_offset.x + 16;
+
+    // Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
+    for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
+        // First sub-block
+        uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
+        uint b_sub_offset = col * ne00 + step;
+
+        // Load scale and m for current Q5_1 block
+        uint blk_offset = s_sub_offset + get_global_id(0);
+        half s = src0_d[blk_offset];
+        half m = src0_m[blk_offset];
+
+        // Load 32 qh (5-th bit of each Q5) for the entire block
+        uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]);
+
+        // Load 16 qs (half block) in transposed layout: two uints, the second one ne01 texels after the first
+        uint2 qsx16;
+        qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
+        qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        float8 bx8_f32;
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        half8 bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization: first half-block uses the low 16 qh bits
+        dequantize_q5_1(as_ushort4(qsx16), qhx32.lo, reg_a, s, m);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // 32 16x16 fp16 dot product with 8 elements reduction for better precision
+        half16 acc; // scratch written by dotx16_reduce8
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+
+        // Repeat for second sub-block
+        uint half_step = step + TILESIZE_K;
+        q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
+        b_sub_offset = col * ne00 + half_step;
+
+        // Load next 16 qs in transposed layout
+        qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
+        qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
+
+        // Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
+        bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
+        bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
+        // Convert to half and store to LM to share within the subgroup
+        bx8_f16 = convert_half8(bx8_f32);
+        shared_b[b_local_offset.x] = bx8_f16.lo;
+        shared_b[b_local_offset.y] = bx8_f16.hi;
+
+        // Dequantization: second half-block uses the high 16 qh bits, same scale and m
+        dequantize_q5_1(as_ushort4(qsx16), qhx32.hi, reg_a, s, m);
+
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+        // 32 16x16 fp16 dot product with 8 elements reduction for better precision (same as the first sub-block)
+        dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
+        dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
+    }
+
+    // Load the output index of each tile column from src2 and share it in LM, pre-multiplied by ne01
+    __local uint out_idx[TILESIZE_N];
+
+    if (get_local_id(0) < TILESIZE_N) {
+        uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
+        if (idx == 0xFFFFFFFF) { // padding column: redirect it to the tile's first output index
+            idx = src2[block_id_n * TILESIZE_N + 0];
+        }
+        out_idx[get_local_id(0)] = idx * ne01;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scatter results back to original position in output grid
+    uint m_offset = row + get_local_id(0);
+
+    write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
+    write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
+    write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
+    write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
+    write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
+    write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
+    write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
+    write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
+    write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
+    write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
+    write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
+    write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
+    write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
+    write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
+    write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
+    write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
+    write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
+    write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
+    write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
+    write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
+    write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
+    write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
+    write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
+    write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
+    write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
+    write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
+    write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
+    write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
+    write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
+    write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
+    write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
+
+    // Padding columns were redirected to out_idx[0], so their writes above land on column 0's slot; column 0 is written last, after the barrier, so that its real result is intended to override them
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
+}
--- a/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl
@@ -0,0 +1,233 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load : enable
+// Sampler shared by the image reads in this file: unnormalized integer coordinates, CLK_ADDRESS_CLAMP for out-of-range coordinates, nearest filtering.
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+__kernel void adreno_xmem_pack_src_f32(
+    __global const void * src_void,
+    ulong offset,
+    __write_only image2d_t src_img,
+    int K,
+    int N) {
+    // One work-item converts four consecutive floats of row `col` to half and stores them as texel (col, kq).
+    const int col = get_global_id(0);
+    const int kq = get_global_id(1);
+    if (col >= N || kq >= K / 4) {
+        return;
+    }
+
+    // `offset` is in bytes, so it is applied before the buffer is viewed as floats.
+    __global const float * in = (__global const float *)((__global const char *)src_void + offset);
+    const int first = col*K + kq*4;
+    const half4 texel = (half4)((half)in[first + 0], (half)in[first + 1], (half)in[first + 2], (half)in[first + 3]);
+    write_imageh(src_img, (int2)(col, kq), texel);
+}
+// Repack f16 weights read as src[o*K + k] into dst: each work-item emits four half4 (w0..w3), one per k in k_base..k_base+3, each holding rows o0..o3; entries with k >= K or o >= M stay 0.
+__kernel void adreno_xmem_prepack_weight_f16(
+    __global half4 * dst,
+    __global const void * src_void,
+    ulong offset, // byte offset added to src_void before it is read as half
+    int K,
+    int M,
+    int kpack,
+    int npack,
+    int os) {
+    const int linear = get_global_id(0);
+    const int total = kpack*npack; // number of work-items that produce output
+    if (linear >= total) {
+        return;
+    }
+
+    __global const half * src = (__global const half *)((__global const char *)src_void + offset);
+
+    // Decompose the linear index as (dst_o, dst_i, dst_ogroup) with dst_ogroup varying fastest
+    const int dst_ogroup = linear % os;
+    const int dst_o_sp_i = linear / os;
+    const int dst_i = dst_o_sp_i % kpack;
+    const int dst_o = dst_o_sp_i / kpack;
+    const int o_slice = dst_o*os + dst_ogroup; // index of the group of four rows
+    const int k_base = dst_i*4; // first of the four k values handled here
+
+    half4 w0 = (half4)(0.0h);
+    half4 w1 = (half4)(0.0h);
+    half4 w2 = (half4)(0.0h);
+    half4 w3 = (half4)(0.0h);
+
+    const int o0 = o_slice*4 + 0;
+    const int o1 = o_slice*4 + 1;
+    const int o2 = o_slice*4 + 2;
+    const int o3 = o_slice*4 + 3;
+
+    if (k_base + 0 < K) { // w0: k = k_base + 0 for rows o0..o3
+        if (o0 < M) w0.s0 = src[o0*K + k_base + 0];
+        if (o1 < M) w0.s1 = src[o1*K + k_base + 0];
+        if (o2 < M) w0.s2 = src[o2*K + k_base + 0];
+        if (o3 < M) w0.s3 = src[o3*K + k_base + 0];
+    }
+    if (k_base + 1 < K) { // w1: k = k_base + 1
+        if (o0 < M) w1.s0 = src[o0*K + k_base + 1];
+        if (o1 < M) w1.s1 = src[o1*K + k_base + 1];
+        if (o2 < M) w1.s2 = src[o2*K + k_base + 1];
+        if (o3 < M) w1.s3 = src[o3*K + k_base + 1];
+    }
+    if (k_base + 2 < K) { // w2: k = k_base + 2
+        if (o0 < M) w2.s0 = src[o0*K + k_base + 2];
+        if (o1 < M) w2.s1 = src[o1*K + k_base + 2];
+        if (o2 < M) w2.s2 = src[o2*K + k_base + 2];
+        if (o3 < M) w2.s3 = src[o3*K + k_base + 2];
+    }
+    if (k_base + 3 < K) { // w3: k = k_base + 3
+        if (o0 < M) w3.s0 = src[o0*K + k_base + 3];
+        if (o1 < M) w3.s1 = src[o1*K + k_base + 3];
+        if (o2 < M) w3.s2 = src[o2*K + k_base + 3];
+        if (o3 < M) w3.s3 = src[o3*K + k_base + 3];
+    }
+
+    dst[linear*4 + 0] = w0; // four consecutive half4 per work-item
+    dst[linear*4 + 1] = w1;
+    dst[linear*4 + 2] = w2;
+    dst[linear*4 + 3] = w3;
+}
+// f16 GEMM: each work-item accumulates eight half4 results (r0..r7) for image column X from src_img texels and staged weights, then writes them to dst_img rows Z*8 .. Z*8+7.
+__attribute__((qcom_max_concurrent_subgroups(12)))
+__kernel void kernel_gemm_xmem_f16_f32_os8(
+    __constant half8 * weights_buffer __attribute__((sub_group_uniform)),
+    __constant half8 * xmem_buffer __attribute__((max_constant_size((6144)))), // 6144 bytes = 12 slots x 32 half8
+    __read_only image2d_t src_img,
+    __write_only image2d_t dst_img,
+    int N,
+    int npack,
+    int kpack) {
+    const int X = get_group_id(1)*get_local_size(0) + get_local_id(0); // x coordinate in src_img / dst_img
+    const int Z = get_group_id(0)*get_local_size(2) + get_local_id(2); // index of the group of 8 output rows
+
+    if (X >= N || Z*8 >= npack) {
+        return;
+    }
+
+    half4 r0 = (half4)(0.0h);
+    half4 r1 = (half4)(0.0h);
+    half4 r2 = (half4)(0.0h);
+    half4 r3 = (half4)(0.0h);
+    half4 r4 = (half4)(0.0h);
+    half4 r5 = (half4)(0.0h);
+    half4 r6 = (half4)(0.0h);
+    half4 r7 = (half4)(0.0h);
+
+    int f_offset = Z*kpack*32; // running offset into weights_buffer, advanced by 64 per iteration
+    int subgroup_id = (int)(0x1F & qcom_get_physical_sub_group_id());
+    subgroup_id = subgroup_id % 12; // one of 12 slots, matching qcom_max_concurrent_subgroups(12)
+    const int c_offset = subgroup_id*32; // slot start in xmem_buffer, in half8 units
+    __constant half16 * weights_cache = (__constant half16 *)&xmem_buffer[c_offset];
+
+    int coord_s = 0;
+    do { // two src texel rows per iteration; NOTE(review): an odd kpack makes the second read address row kpack - confirm the host guarantees an even kpack
+        const half4 src0 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
+        coord_s++;
+        const half4 src1 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
+        coord_s++;
+
+        qcom_sub_group_constant_load8(xmem_buffer, weights_buffer, c_offset, f_offset >> 1, 32); // presumably stages 32 half8 of weights into this subgroup's slot - verify against the vendor documentation
+        f_offset += 64;
+        qcom_sub_group_sync(QCOM_CLK_CONST_LOAD_SYNC);
+
+        r0 += src0.x * weights_cache[0].s0123;
+        r0 += src0.y * weights_cache[0].s4567;
+        r0 += src0.z * weights_cache[0].s89ab;
+        r0 += src0.w * weights_cache[0].scdef;
+        r1 += src0.x * weights_cache[1].s0123;
+        r1 += src0.y * weights_cache[1].s4567;
+        r1 += src0.z * weights_cache[1].s89ab;
+        r1 += src0.w * weights_cache[1].scdef;
+        r2 += src0.x * weights_cache[2].s0123;
+        r2 += src0.y * weights_cache[2].s4567;
+        r2 += src0.z * weights_cache[2].s89ab;
+        r2 += src0.w * weights_cache[2].scdef;
+        r3 += src0.x * weights_cache[3].s0123;
+        r3 += src0.y * weights_cache[3].s4567;
+        r3 += src0.z * weights_cache[3].s89ab;
+        r3 += src0.w * weights_cache[3].scdef;
+        r4 += src0.x * weights_cache[4].s0123;
+        r4 += src0.y * weights_cache[4].s4567;
+        r4 += src0.z * weights_cache[4].s89ab;
+        r4 += src0.w * weights_cache[4].scdef;
+        r5 += src0.x * weights_cache[5].s0123;
+        r5 += src0.y * weights_cache[5].s4567;
+        r5 += src0.z * weights_cache[5].s89ab;
+        r5 += src0.w * weights_cache[5].scdef;
+        r6 += src0.x * weights_cache[6].s0123;
+        r6 += src0.y * weights_cache[6].s4567;
+        r6 += src0.z * weights_cache[6].s89ab;
+        r6 += src0.w * weights_cache[6].scdef;
+        r7 += src0.x * weights_cache[7].s0123;
+        r7 += src0.y * weights_cache[7].s4567;
+        r7 += src0.z * weights_cache[7].s89ab;
+        r7 += src0.w * weights_cache[7].scdef;
+
+        r0 += src1.x * weights_cache[8].s0123;
+        r0 += src1.y * weights_cache[8].s4567;
+        r0 += src1.z * weights_cache[8].s89ab;
+        r0 += src1.w * weights_cache[8].scdef;
+        r1 += src1.x * weights_cache[9].s0123;
+        r1 += src1.y * weights_cache[9].s4567;
+        r1 += src1.z * weights_cache[9].s89ab;
+        r1 += src1.w * weights_cache[9].scdef;
+        r2 += src1.x * weights_cache[10].s0123;
+        r2 += src1.y * weights_cache[10].s4567;
+        r2 += src1.z * weights_cache[10].s89ab;
+        r2 += src1.w * weights_cache[10].scdef;
+        r3 += src1.x * weights_cache[11].s0123;
+        r3 += src1.y * weights_cache[11].s4567;
+        r3 += src1.z * weights_cache[11].s89ab;
+        r3 += src1.w * weights_cache[11].scdef;
+        r4 += src1.x * weights_cache[12].s0123;
+        r4 += src1.y * weights_cache[12].s4567;
+        r4 += src1.z * weights_cache[12].s89ab;
+        r4 += src1.w * weights_cache[12].scdef;
+        r5 += src1.x * weights_cache[13].s0123;
+        r5 += src1.y * weights_cache[13].s4567;
+        r5 += src1.z * weights_cache[13].s89ab;
+        r5 += src1.w * weights_cache[13].scdef;
+        r6 += src1.x * weights_cache[14].s0123;
+        r6 += src1.y * weights_cache[14].s4567;
+        r6 += src1.z * weights_cache[14].s89ab;
+        r6 += src1.w * weights_cache[14].scdef;
+        r7 += src1.x * weights_cache[15].s0123;
+        r7 += src1.y * weights_cache[15].s4567;
+        r7 += src1.z * weights_cache[15].s89ab;
+        r7 += src1.w * weights_cache[15].scdef;
+    } while (coord_s < kpack);
+
+    int coord_s_out = Z*8; // first output row of this work-item; each write below is skipped once the row reaches npack
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r0); coord_s_out++; }
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r1); coord_s_out++; }
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r2); coord_s_out++; }
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r3); coord_s_out++; }
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r4); coord_s_out++; }
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r5); coord_s_out++; }
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r6); coord_s_out++; }
+    if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r7); }
+}
+
+__kernel void adreno_xmem_store_dst_f32(
+    __read_only image2d_t dst_img,
+    __global void * dst_void,
+    ulong offset,
+    int M,
+    int N) {
+    // One work-item widens texel (col, pack) to float and stores up to four values at out[col*M + pack*4 ...].
+    const int col = get_global_id(0);
+    const int pack = get_global_id(1);
+    if (col >= N || pack >= (M + 3) / 4) {
+        return;
+    }
+
+    __global float * out = (__global float *)((__global char *)dst_void + offset);
+    const half4 texel = read_imageh(dst_img, smp_zero, (int2)(col, pack));
+    const int first = pack*4;
+    const int base = col*M + first;
+    if (first + 0 < M) out[base + 0] = (float)texel.s0;
+    if (first + 1 < M) out[base + 1] = (float)texel.s1;
+    if (first + 2 < M) out[base + 2] = (float)texel.s2;
+    if (first + 3 < M) out[base + 3] = (float)texel.s3;
+}
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
@@ -0,0 +1,119 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#define QK_Q4_1 32
+#define N_SIMDGROUP 4
+#define SIMDGROUP_WIDTH 64
+
+// Dequantize 8 packed nibbles (low nibble of q4x8.s0 first) as q * s + m, evaluated in half and then widened to float.
+static inline float8 q4_1_to_fp32_packed8(ushort2 q4x8, half s, half m) {
+    return (float8)(
+        (float)(( q4x8.s0        & 0x000F) * s + m),
+        (float)(((q4x8.s0 >>  4) & 0x000F) * s + m),
+        (float)(((q4x8.s0 >>  8) & 0x000F) * s + m),
+        (float)(((q4x8.s0 >> 12) & 0x000F) * s + m),
+        (float)(( q4x8.s1        & 0x000F) * s + m),
+        (float)(((q4x8.s1 >>  4) & 0x000F) * s + m),
+        (float)(((q4x8.s1 >>  8) & 0x000F) * s + m),
+        (float)(((q4x8.s1 >> 12) & 0x000F) * s + m)
+    );
+}
+
+// MoE GEMV over Q4_1 weights. Work-item (i01, i20) computes dst[i01 + i20*ne01]: the dot product of
+// row i01 of expert src2[i20] with the src1 vector selected by i20 % ne11. The ne00/32 blocks of the
+// row are split across 4 subgroups (local id 1) and the partial sums are combined in local memory.
+__attribute__((qcom_reqd_sub_group_size("half")))
+__kernel void kernel_gemv_moe_q4_1_f32_ns(
+    __global uint * src0_q,             // packed 4-bit quants, 8 per uint
+    __global half * src0_d,             // one scale per 32-value block
+    __global half * src0_m,             // one offset per 32-value block
+    __read_only image1d_buffer_t src1,  // activations, fetched as float4 texels
+    __global uint * src2,               // expert id for each i20
+    __global float * dst,
+    ulong         offsetd,              // byte offset applied to dst
+    int           ne00,                 // row length; assumed to be a multiple of QK_Q4_1
+    int           ne01,
+    int           ne11
+) {
+    uint i01  = get_global_id(0);
+    uint i20  = get_global_id(2);
+    uint sgid = get_local_id(1);
+    uint slid = get_sub_group_local_id();
+
+    uint i11 = i20 % ne11;
+
+    // Number of 32-value blocks that precede this expert. ne00 is divided first because the 32-bit
+    // product expert_id * ne00 * ne01 wraps once it exceeds 2^32, i.e. before "/ 32" could shrink it.
+    uint expert_id = src2[i20];
+    uint expert_offset = expert_id * (ne00 / QK_Q4_1) * ne01;
+
+    __private float sum = 0.0f; // partial sum of this work-item's single output
+
+    // each subgroup walks the blocks of the row with a stride of N_SIMDGROUP
+    for (uint ib00 = sgid; ib00 < (ne00 / QK_Q4_1); ib00 += N_SIMDGROUP) {
+        // the 4 uints of a block are stored ne01 apart, so neighbouring i01 read neighbouring words
+        uint4 regQ;
+        uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;
+
+        regQ.s0 = src0_q[block_offset];
+        regQ.s1 = src0_q[block_offset + ne01];
+        regQ.s2 = src0_q[block_offset + ne01 * 2];
+        regQ.s3 = src0_q[block_offset + ne01 * 3];
+
+        // first of the 8 float4 texels of src1 that pair with this block
+        uint offset = i11 * ne00 / 4 + ib00 * 8;
+
+        // offset (m) and scale (d) of the block
+        half regM = src0_m[ib00 * ne01 + i01 + expert_offset];
+        half regS = src0_d[ib00 * ne01 + i01 + expert_offset];
+
+        // quants 0..7
+        float8 fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s0), regS, regM);
+        float4 shared_y4;
+        shared_y4 = read_imagef(src1, (offset + 0));
+        float4 acc = shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 1));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 8..15
+        fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s1), regS, regM);
+        shared_y4 = read_imagef(src1, (offset + 2));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 3));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 16..23
+        fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s2), regS, regM);
+        shared_y4 = read_imagef(src1, (offset + 4));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 5));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 24..31
+        fp32x8 = q4_1_to_fp32_packed8(as_ushort2(regQ.s3), regS, regM);
+        shared_y4 = read_imagef(src1, (offset + 6));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 7));
+        acc += shared_y4 * fp32x8.hi;
+
+        sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
+    }
+
+    // reduction in local memory, assumes #subgroups=4:
+    // subgroups 1..3 publish their partial sums, subgroup 0 adds them to its own
+    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
+    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
+    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
+    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // one output per work-item of subgroup 0
+    if (sgid == 0) {
+        dst = dst + (offsetd >> 2);
+        dst[i01 + i20 * ne01] = sum;
+    }
+}
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
@@ -0,0 +1,119 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#define QK_Q5_0 32
+#define N_SIMDGROUP 4
+#define SIMDGROUP_WIDTH 64
+
+// Rebuild 8 five-bit quants (nibble i of qs5x8 as low bits, bit i of qh5x8 as bit 4) and return them minus 16 as floats.
+static inline float8 q5_0_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8) {
+    return (float8)(
+        (float)((( qs5x8.s0        & 0x000F) | (( qh5x8       & 0x01) << 4)) - 16),
+        (float)((((qs5x8.s0 >>  4) & 0x000F) | (((qh5x8 >> 1) & 0x01) << 4)) - 16),
+        (float)((((qs5x8.s0 >>  8) & 0x000F) | (((qh5x8 >> 2) & 0x01) << 4)) - 16),
+        (float)((((qs5x8.s0 >> 12) & 0x000F) | (((qh5x8 >> 3) & 0x01) << 4)) - 16),
+        (float)((( qs5x8.s1        & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) - 16),
+        (float)((((qs5x8.s1 >>  4) & 0x000F) | (((qh5x8 >> 5) & 0x01) << 4)) - 16),
+        (float)((((qs5x8.s1 >>  8) & 0x000F) | (((qh5x8 >> 6) & 0x01) << 4)) - 16),
+        (float)((((qs5x8.s1 >> 12) & 0x000F) | (((qh5x8 >> 7) & 0x01) << 4)) - 16)
+    );
+}
+
+// MoE GEMV over Q5_0 weights. Work-item (i01, i20) computes dst[i01 + i20*ne01]: the dot product of
+// row i01 of expert src2[i20] with the src1 vector selected by i20 % ne11. The ne00/32 blocks of the
+// row are split across 4 subgroups (local id 1) and the partial sums are combined in local memory.
+__attribute__((qcom_reqd_sub_group_size("half")))
+__kernel void kernel_gemv_moe_q5_0_f32_ns(
+    __global    uint *           src0_qs,   // low 4 bits of the quants, 8 per uint
+    __global    uint *           src0_qh,   // 5th bit of the quants, one uint per 32-value block
+    __global    half *           src0_d,    // one scale per 32-value block
+    __read_only image1d_buffer_t src1,      // activations, fetched as float4 texels
+    __global    uint *           src2,      // expert id for each i20
+    __global    float *          dst,
+    ulong offsetd,                          // byte offset applied to dst
+    uint  ne00,                             // row length; assumed to be a multiple of QK_Q5_0
+    uint  ne01,
+    uint  ne11
+) {
+    uint i01  = get_global_id(0);
+    uint i20  = get_global_id(2);
+    uint sgid = get_local_id(1);
+    uint slid = get_sub_group_local_id();
+
+    uint i11 = i20 % ne11;
+
+    // Number of 32-value blocks that precede this expert. ne00 is divided first because the 32-bit
+    // product expert_id * ne00 * ne01 wraps once it exceeds 2^32, i.e. before "/ 32" could shrink it.
+    uint expert_id = src2[i20];
+    uint expert_offset = expert_id * (ne00 / QK_Q5_0) * ne01;
+
+    __private float sum = 0.0f; // partial sum of this work-item's single output
+
+    // each subgroup walks the blocks of the row with a stride of N_SIMDGROUP
+    for (uint ib00 = sgid; ib00 < (ne00 / QK_Q5_0); ib00 += N_SIMDGROUP) {
+        // the 4 uints of a block are stored ne01 apart, so neighbouring i01 read neighbouring words
+        uint4 regQ;
+        uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;
+
+        regQ.s0 = src0_qs[block_offset];
+        regQ.s1 = src0_qs[block_offset + ne01];
+        regQ.s2 = src0_qs[block_offset + ne01 * 2];
+        regQ.s3 = src0_qs[block_offset + ne01 * 3];
+
+        // first of the 8 float4 texels of src1 that pair with this block
+        uint offset = i11 * ne00 / 4 + ib00 * 8;
+
+        // high bits (one byte per 8 quants) and scale of the block
+        uchar4 regQh = as_uchar4(src0_qh[ib00 * ne01 + i01 + expert_offset]);
+        half regS = src0_d[ib00 * ne01 + i01 + expert_offset];
+
+        // quants 0..7
+        float8 fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0);
+        float4 shared_y4;
+        shared_y4 = read_imagef(src1, (offset + 0));
+        float4 acc = shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 1));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 8..15
+        fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1);
+        shared_y4 = read_imagef(src1, (offset + 2));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 3));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 16..23
+        fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2);
+        shared_y4 = read_imagef(src1, (offset + 4));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 5));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 24..31
+        fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3);
+        shared_y4 = read_imagef(src1, (offset + 6));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 7));
+        acc += shared_y4 * fp32x8.hi;
+
+        sum += (float)(regS) * ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
+    }
+
+    // reduction in local memory, assumes #subgroups=4:
+    // subgroups 1..3 publish their partial sums, subgroup 0 adds them to its own
+    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
+    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
+    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
+    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // one output per work-item of subgroup 0
+    if (sgid == 0) {
+        dst = dst + (offsetd >> 2);
+        dst[i01 + i20 * ne01] = sum;
+    }
+}
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
@@ -0,0 +1,121 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#define QK_Q5_1 32
+#define N_SIMDGROUP 4
+#define SIMDGROUP_WIDTH 64
+
+// Rebuild 8 five-bit quants (nibble i of qs5x8 as low bits, bit i of qh5x8 as bit 4) and return q * s + m as floats.
+static inline float8 q5_1_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8, half s, half m) {
+    return (float8)(
+        (float)((( qs5x8.s0        & 0x000F) | (( qh5x8       & 0x01) << 4)) * s + m),
+        (float)((((qs5x8.s0 >>  4) & 0x000F) | (((qh5x8 >> 1) & 0x01) << 4)) * s + m),
+        (float)((((qs5x8.s0 >>  8) & 0x000F) | (((qh5x8 >> 2) & 0x01) << 4)) * s + m),
+        (float)((((qs5x8.s0 >> 12) & 0x000F) | (((qh5x8 >> 3) & 0x01) << 4)) * s + m),
+        (float)((( qs5x8.s1        & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) * s + m),
+        (float)((((qs5x8.s1 >>  4) & 0x000F) | (((qh5x8 >> 5) & 0x01) << 4)) * s + m),
+        (float)((((qs5x8.s1 >>  8) & 0x000F) | (((qh5x8 >> 6) & 0x01) << 4)) * s + m),
+        (float)((((qs5x8.s1 >> 12) & 0x000F) | (((qh5x8 >> 7) & 0x01) << 4)) * s + m)
+    );
+}
+
+// MoE GEMV over Q5_1 weights. Work-item (i01, i20) computes dst[i01 + i20*ne01]: the dot product of
+// row i01 of expert src2[i20] with the src1 vector selected by i20 % ne11. The ne00/32 blocks of the
+// row are split across 4 subgroups (local id 1) and the partial sums are combined in local memory.
+__attribute__((qcom_reqd_sub_group_size("half")))
+__kernel void kernel_gemv_moe_q5_1_f32_ns(
+    __global    uint *           src0_qs,   // low 4 bits of the quants, 8 per uint
+    __global    uint *           src0_qh,   // 5th bit of the quants, one uint per 32-value block
+    __global    half *           src0_d,    // one scale per 32-value block
+    __global    half *           src0_m,    // one offset per 32-value block
+    __read_only image1d_buffer_t src1,      // activations, fetched as float4 texels
+    __global    uint *           src2,      // expert id for each i20
+    __global    float *          dst,
+    ulong offsetd,                          // byte offset applied to dst
+    uint  ne00,                             // row length; assumed to be a multiple of QK_Q5_1
+    uint  ne01,
+    uint  ne11
+) {
+    uint i01  = get_global_id(0);
+    uint i20  = get_global_id(2);
+    uint sgid = get_local_id(1);
+    uint slid = get_sub_group_local_id();
+
+    uint i11 = i20 % ne11;
+
+    // Number of 32-value blocks that precede this expert. ne00 is divided first because the 32-bit
+    // product expert_id * ne00 * ne01 wraps once it exceeds 2^32, i.e. before "/ 32" could shrink it.
+    uint expert_id = src2[i20];
+    uint expert_offset = expert_id * (ne00 / QK_Q5_1) * ne01;
+
+    __private float sum = 0.0f; // partial sum of this work-item's single output
+
+    // each subgroup walks the blocks of the row with a stride of N_SIMDGROUP
+    for (uint ib00 = sgid; ib00 < (ne00 / QK_Q5_1); ib00 += N_SIMDGROUP) {
+        // the 4 uints of a block are stored ne01 apart, so neighbouring i01 read neighbouring words
+        uint4 regQ;
+        uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;
+
+        regQ.s0 = src0_qs[block_offset];
+        regQ.s1 = src0_qs[block_offset + ne01];
+        regQ.s2 = src0_qs[block_offset + ne01 * 2];
+        regQ.s3 = src0_qs[block_offset + ne01 * 3];
+
+        // first of the 8 float4 texels of src1 that pair with this block
+        uint offset = i11 * ne00 / 4 + ib00 * 8;
+
+        // high bits (one byte per 8 quants), offset (m) and scale (d) of the block
+        uchar4 regQh = as_uchar4(src0_qh[ib00 * ne01 + i01 + expert_offset]);
+        half regM = src0_m[ib00 * ne01 + i01 + expert_offset];
+        half regS = src0_d[ib00 * ne01 + i01 + expert_offset];
+
+        // quants 0..7
+        float8 fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0, regS, regM);
+        float4 shared_y4;
+        shared_y4 = read_imagef(src1, (offset + 0));
+        float4 acc = shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 1));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 8..15
+        fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1, regS, regM);
+        shared_y4 = read_imagef(src1, (offset + 2));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 3));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 16..23
+        fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2, regS, regM);
+        shared_y4 = read_imagef(src1, (offset + 4));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 5));
+        acc += shared_y4 * fp32x8.hi;
+
+        // quants 24..31
+        fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3, regS, regM);
+        shared_y4 = read_imagef(src1, (offset + 6));
+        acc += shared_y4 * fp32x8.lo;
+        shared_y4 = read_imagef(src1, (offset + 7));
+        acc += shared_y4 * fp32x8.hi;
+
+        sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
+    }
+
+    // reduction in local memory, assumes #subgroups=4:
+    // subgroups 1..3 publish their partial sums, subgroup 0 adds them to its own
+    __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
+    if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
+    if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
+    if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // one output per work-item of subgroup 0
+    if (sgid == 0) {
+        dst = dst + (offsetd >> 2);
+        dst[i01 + i20 * ne01] = sum;
+    }
+}
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -39,6 +39,18 @@ if (WIN32)
        set(CMAKE_CXX_COMPILER "icx")
        set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
    endif()
+    # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled)
+    if(GGML_SYCL_SUPPORT_LEVEL_ZERO)
+        if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH})
+            file(TO_CMAKE_PATH "$ENV{LEVEL_ZERO_V1_SDK_PATH}" LEVEL_ZERO_V1_SDK_PATH) # native path -> forward slashes before it is reused below
+            if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}")
+                target_include_directories(ggml-sycl PRIVATE "${LEVEL_ZERO_V1_SDK_PATH}/include")
+                set(LEVEL_ZERO_V1_SDK_LIB_PATH "${LEVEL_ZERO_V1_SDK_PATH}/lib")
+            else()
+                message(WARNING "LEVEL_ZERO_V1_SDK_PATH set but folder not found: ${LEVEL_ZERO_V1_SDK_PATH}")
+            endif()
+        endif()
+    endif()
 endif()

 macro(detect_and_find_package package_name)
@@ -93,6 +105,23 @@ endif()

 target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")

+message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
+if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
+    # Link against the Level Zero loader for direct device memory allocation. This avoids sycl::malloc_device
+    # triggering DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
+    find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h HINTS ${ONEAPI_ROOT}/include ${LEVEL_ZERO_V1_SDK_PATH}/include)
+    find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH)
+    if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
+        target_include_directories(ggml-sycl PRIVATE "${LEVEL_ZERO_INCLUDE_DIR}") # the located headers may be outside the default search path
+        target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
+        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
+        message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
+        message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}")
+    else()
+        message(WARNING "Level Zero loader or headers not found, Level Zero support disabled")
+    endif()
+endif()
+
 # Link against oneDNN
 set(GGML_SYCL_DNNL 0)
 if(GGML_SYCL_DNN)
--- a/Show More
+++ b/Show More