HIP: RDNA3 mma FA, faster AMD transpose, tune AMD (#22880)

Adds RDNA3 support to the CUDA mma FA kernel. To make the RDNA3 tensor cores work with FP16 accumulation for VKQ, the tiles need to be 32 logical units long in the direction of the attention head; for head sizes 80 and 112, which are not evenly divisible by 32, the regular length of 16 with FP32 accumulation is used instead. The longer tiles also enable more efficient transposition for a warp size of 32, which is why they are also used for RDNA4. However, this scrambles the data layout of the accumulators along the attention head dimension. To prevent accidental misuse I added another entry to ggml_cuda_mma::data_layout. I also tuned the kernel parameters for RDNA3, RDNA4, and CDNA1 in general, during which I discovered that the kernel can be made to work for head sizes up to 256 for CDNA. For RDNA3/4 I was not able to get better performance than the tile kernel for head sizes > 128.
CI : support IOT device (IQ9) (#22987)
2026-05-21 17:17:24 +03:00 · 2026-05-14 22:58:58 +02:00 · 2026-05-14 13:58:34 -07:00 · 2026-05-14 09:41:32 -07:00 · 2026-05-14 09:31:36 -07:00 · 2026-05-14 17:57:20 +02:00
546 changed files with 66281 additions and 45272 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -5,8 +5,15 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
+ARG LEVEL_ZERO_VERSION=1.28.2
+ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
 RUN apt-get update && \
-    apt-get install -y git libssl-dev
+    apt-get install -y git libssl-dev wget ca-certificates && \
+    cd /tmp && \
+    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
+    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
+    apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
+    rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb

 WORKDIR /app

@@ -33,11 +40,11 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
+ARG IGC_VERSION=v2.20.5
+ARG IGC_VERSION_FULL=2_2.20.5+19972
+ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
+ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
+ARG IGDGMM_VERSION=22.8.2
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
@@ -109,4 +116,3 @@ WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

 ENTRYPOINT [ "/app/llama-server" ]
-
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -103,6 +103,7 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
+    spirv-headers
  ];
 in

@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      ninja
      pkg-config
      git
-      spirv-headers
    ]
    ++ optionals useCuda [
      cudaPackages.cuda_nvcc
--- a/.editorconfig
+++ b/.editorconfig
@@ -53,14 +53,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

-[tools/server/public/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
 [benches/**]
 indent_style = unset
 indent_size = unset
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +0,0 @@
-# Treat the generated single-file WebUI build as binary for diff purposes.
-# Git's pack-file delta compression still works (byte-level), but this prevents
-# git diff from printing the entire minified file on every change.
-tools/server/public/index.html -diff
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -12,6 +12,8 @@ body:
        after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
+
+        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: commit
    attributes:
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -1,5 +1,5 @@
 name: Bug (model use)
-description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
+description: Something goes wrong when running a model (crashes, garbled outputs, etc.).
 title: "Eval bug: "
 labels: ["bug-unconfirmed", "model evaluation"]
 body:
@@ -12,6 +12,8 @@ body:
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-completion` binary can be used for simple and reproducible model inference.
+
+        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -10,6 +10,8 @@ body:
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
+
+        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -8,6 +8,8 @@ body:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)

+        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
+
  - type: checkboxes
    id: prerequisites
    attributes:
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -8,6 +8,8 @@ body:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

+        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
+
  - type: checkboxes
    id: research-stage
    attributes:
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -9,6 +9,8 @@ body:
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

+        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
+
  - type: textarea
    id: background-description
    attributes:
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -77,7 +77,6 @@ server/webui:
    - changed-files:
        - any-glob-to-any-file:
            - tools/server/webui/**
-            - tools/server/public/**
 server:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -58,14 +58,45 @@ jobs:
          name: llama-cpp-android-arm64-snapdragon
          path: pkg-snapdragon/llama.cpp

+  linux-iot-snapdragon:
+    runs-on: ubuntu-latest
+    container:
+      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          lfs: false
+
+      - name: Build Llama.CPP for Snapdragon Linux IoT
+        id: build_llama_cpp_snapdragon_linux
+        run: |
+          cp docs/backend/snapdragon/CMakeUserPresets.json .
+          cmake --preset arm64-linux-snapdragon-release -B build-snapdragon -DGGML_OPENCL=ON
+          cmake --build build-snapdragon -j $(nproc)
+          cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
+
+      - name: Upload Llama.CPP Snapdragon Linux IoT Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_snapdragon_linux.outcome == 'success' }}
+        uses: actions/upload-artifact@v6
+        with:
+          name: llama-cpp-linux-arm64-snapdragon
+          path: pkg-snapdragon/llama.cpp
+
  test-snapdragon-qdc:
-    name: Test on QDC Android Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon]
-    runs-on: ubuntu-slim
+    name: Test on QDC Device (${{ matrix.device }})
+    needs: [android-ndk-snapdragon, linux-iot-snapdragon]
+    runs-on: ubuntu-24.04-arm
+    timeout-minutes: 90
    strategy:
      fail-fast: false
      matrix:
-        device: [SM8750, SM8650, SM8850]
+        device: [SM8750, SM8850, QCS9075M]

    steps:
      - name: Checkout
@@ -74,11 +105,11 @@ jobs:
      - name: Download build artifact
        uses: actions/download-artifact@v7
        with:
-          name: llama-cpp-android-arm64-snapdragon
+          name: ${{ startsWith(matrix.device, 'QCS') && 'llama-cpp-linux-arm64-snapdragon' || 'llama-cpp-android-arm64-snapdragon' }}
          path: pkg-snapdragon/llama.cpp

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: '3.x'
          cache: pip
@@ -107,7 +138,8 @@ jobs:
              --test       all \
              --pkg-dir    pkg-snapdragon/llama.cpp \
              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }}
+              --device     ${{ matrix.device }} \
+              ${{ startsWith(matrix.device, 'QCS') && '--retries 2 --retry-delay 300' || '' }}
        env:
          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}

--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -301,16 +301,17 @@ jobs:
          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
          cmake -B build -DLLAMA_OPENSSL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DGGML_CPU_REPACK=OFF \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
                         -DGGML_RVV=ON \
+                         -DGGML_RV_ZVFH=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
                         -DGGML_RV_ZIHINTPAUSE=ON \
-                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+                         -DGGML_RV_ZBA=ON \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -55,7 +55,22 @@ env:
  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
+  determine-tag:
+    name: Determine tag name
+    runs-on: ubuntu-slim
+    outputs:
+      tag_name: ${{ steps.tag.outputs.name }}
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
  ggml-ci-nvidia-cuda:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -65,11 +80,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          nvidia-smi
          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -79,11 +97,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm2:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]

    steps:
@@ -93,39 +114,40 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  # TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
-  #ggml-ci-nvidia-webgpu:
-  #  runs-on: [self-hosted, Linux, NVIDIA]
+  ggml-ci-nvidia-webgpu:
+    runs-on: [self-hosted, Linux, NVIDIA]

-  #  steps:
-  #    - name: Clone
-  #      id: checkout
-  #      uses: actions/checkout@v6
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6

-  #    - name: Dawn Dependency
-  #      id: dawn-depends
-  #      run: |
-  #        DAWN_VERSION="v20260317.182325"
-  #        DAWN_OWNER="google"
-  #        DAWN_REPO="dawn"
-  #        DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-  #        echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-  #        curl -L -o artifact.tar.gz \
-  #          "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-  #        mkdir dawn
-  #        tar -xvf artifact.tar.gz -C dawn --strip-components=1
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1

-  #    - name: Test
-  #      id: ggml-ci
-  #      run: |
-  #        GG_BUILD_WEBGPU=1 \
-  #        GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-  #        GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-  #          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_WEBGPU=1 \
+          GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+          GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
+            bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  # TODO: provision AMX-compatible machine
  #ggml-ci-cpu-amx:
@@ -172,6 +194,7 @@ jobs:
  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-mac-metal:
+    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -181,10 +204,13 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-mac-webgpu:
+    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -207,11 +233,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-mac-vulkan:
+    needs: determine-tag
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -221,11 +250,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-linux-intel-vulkan:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, Intel]

    steps:
@@ -237,11 +269,14 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-win-intel-vulkan:
+    needs: determine-tag
    runs-on: [self-hosted, Windows, X64, Intel]

    steps:
@@ -256,6 +291,7 @@ jobs:
          MSYSTEM: UCRT64
          CHERE_INVOKING: 1
          PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          vulkaninfo --summary
          # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
@@ -263,6 +299,7 @@ jobs:
          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp

  ggml-ci-intel-openvino-gpu-low-perf:
+    needs: determine-tag
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

    concurrency:
@@ -294,6 +331,8 @@ jobs:

      - name: Test
        id: ggml-ci
+        env:
+          HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
        run: |
          source ./openvino_toolkit/setupvars.sh
          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -50,6 +50,8 @@ jobs:
    env:
      ONEAPI_ROOT: /opt/intel/oneapi/
      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"

    continue-on-error: true

@@ -71,6 +73,14 @@ jobs:
          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept

+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
@@ -107,6 +117,7 @@ jobs:
    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
      ONEAPI_INSTALLER_VERSION: "2025.3.3"
    steps:
@@ -127,6 +138,13 @@ jobs:
        run: |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
--- a/.github/workflows/build-virtgpu.yml
+++ b/.github/workflows/build-virtgpu.yml
@@ -0,0 +1,50 @@
+name: CI (virtgpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-virtgpu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-virtgpu.yml',
+      'ggml/src/ggml-virtgpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-24-virtgpu:
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DGGML_VIRTGPU=ON \
+            -DGGML_VIRTGPU_BACKEND=ON
+          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -456,7 +456,8 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 900
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900

  ubuntu-24-webgpu-wasm:
    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -0,0 +1,51 @@
+name: Code Style Checker
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  model-naming:
+    runs-on: ubuntu-slim
+    steps:
+      - uses: actions/checkout@v6
+      - name: Check model naming conventions
+        run: |
+          python3 - << 'EOF'
+          import re, os, sys
+
+          pairs = re.findall(
+              r'case\s+(LLM_ARCH_\w+)\s*:\s*\n\s+return new (llama_model_\w+)\s*\(',
+              open("src/llama-model.cpp").read())
+
+          errors = []
+          for arch, cls in pairs:
+              suffix  = arch[len("LLM_ARCH_"):]
+              csuffix = cls[len("llama_model_"):]
+              fname   = csuffix.replace("_", "-") + ".cpp"
+
+              if not re.fullmatch(r'[A-Z][A-Z0-9_]*',   suffix):
+                  errors.append(f"{arch}: suffix not upper snake case, example: LLM_ARCH_MY_MODEL")
+
+              if not re.fullmatch(r'[a-z][a-z0-9_]*', csuffix):
+                  errors.append(f"{arch}: class suffix not lower snake case, example: llama_model_my_model")
+
+              elif suffix.lower() != csuffix:
+                  errors.append(f"{arch}: arch/class name mismatch, expected class 'llama_model_{suffix.lower()}' but got '{cls}'")
+
+              elif not os.path.isfile(f"src/models/{fname}"):
+                  errors.append(f"{arch}: expects model file name to be src/models/{fname}, but not found")
+
+          if errors:
+              print('\n'.join(f"  - {e}" for e in errors)); sys.exit(1)
+          print(f"OK: {len(pairs)} mappings validated.")
+          EOF
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -2,11 +2,6 @@ name: EditorConfig Checker

 on:
  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
  push:
    branches:
      - master
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -29,10 +29,10 @@ jobs:
      uses: actions/setup-python@v6
      with:
        python-version: '3.11'
+        pip-install: poetry==2.4.0
    - name: Install dependencies
      run: |
        cd gguf-py
-        python -m pip install poetry==2.3.2
        poetry install

    - name: Build package
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -31,7 +31,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.33
+          pip-install: -r requirements/requirements-all.txt ty==0.0.35
      # - name: Type-check with Pyright
      #   uses: jakebailey/pyright-action@v2
      #   with:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -36,7 +36,14 @@ env:
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

 jobs:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
  macOS-cpu:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        include:
@@ -64,6 +71,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -100,6 +113,9 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        include:
@@ -119,6 +135,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        if: ${{ matrix.build != 's390x' }}
        uses: ggml-org/ccache-action@v1.2.21
@@ -169,6 +191,9 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        include:
@@ -186,6 +211,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -237,6 +268,9 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
+    needs:
+      - webui-build
+
    runs-on: ubuntu-latest

    env:
@@ -249,6 +283,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -306,6 +346,9 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
+    needs:
+      - webui-build
+
    runs-on: ubuntu-24.04

    outputs:
@@ -327,6 +370,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -386,6 +435,9 @@ jobs:
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

  windows-cpu:
+    needs:
+      - webui-build
+
    runs-on: windows-2025

    strategy:
@@ -400,6 +452,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -438,6 +496,9 @@ jobs:
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip

  windows:
+    needs:
+      - webui-build
+
    runs-on: windows-2025

    env:
@@ -461,6 +522,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -520,6 +587,9 @@ jobs:
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
+    needs:
+      - webui-build
+
    runs-on: windows-2022

    strategy:
@@ -531,6 +601,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -591,6 +667,9 @@ jobs:
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
+    needs:
+      - webui-build
+
    runs-on: windows-2022

    defaults:
@@ -600,6 +679,7 @@ jobs:
    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
      ONEAPI_INSTALLER_VERSION: "2025.3.3"

@@ -621,6 +701,19 @@ jobs:
        run: |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -655,6 +748,13 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+          if [ -n "$ZE_LOADER_DLL" ]; then
+            echo "Using Level Zero loader: $ZE_LOADER_DLL"
+            cp "$ZE_LOADER_DLL" ./build/bin
+          else
+            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+          fi

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
@@ -681,6 +781,9 @@ jobs:
          name: llama-bin-win-sycl-x64.zip

  ubuntu-24-sycl:
+    needs:
+      - webui-build
+
    strategy:
      matrix:
        build: [fp32, fp16]
@@ -695,6 +798,8 @@ jobs:
    env:
      ONEAPI_ROOT: /opt/intel/oneapi/
      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"

    steps:
      - name: Clone
@@ -718,6 +823,20 @@ jobs:
          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept

+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -757,6 +876,9 @@ jobs:
          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
+    needs:
+      - webui-build
+
    runs-on: ubuntu-22.04

    strategy:
@@ -773,6 +895,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Free up disk space
        uses: ggml-org/free-disk-space@v1.3.1
        with:
@@ -860,6 +988,9 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
+    needs:
+      - webui-build
+
    runs-on: windows-2022

    env:
@@ -876,6 +1007,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Grab rocWMMA package
        id: grab_rocwmma
        run: |
@@ -1122,6 +1259,7 @@ jobs:
    runs-on: ubuntu-slim

    needs:
+      - webui-build
      - windows
      - windows-cpu
      - windows-cuda
@@ -1137,6 +1275,9 @@ jobs:
      - ios-xcode-build
      - openEuler-cann

+    outputs:
+      tag_name: ${{ steps.tag.outputs.name }}
+
    steps:
      - name: Clone
        id: checkout
@@ -1262,3 +1403,15 @@ jobs:
                });
              }
            }
+
+  webui-publish:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+
+    needs:
+      - release
+
+    uses: ./.github/workflows/webui-publish.yml
+    with:
+      version_tag: ${{ needs.release.outputs.tag_name }}
+    secrets:
+      hf_token: ${{ secrets.HF_TOKEN_WEBUI_STATIC_OUTPUT }}
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -39,7 +39,12 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
  server-metal:
+    needs: webui-build
    runs-on: [self-hosted, llama-server, macOS, ARM64]

    name: server-metal (${{ matrix.wf_name }})
@@ -67,6 +72,12 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -1,7 +1,7 @@
 name: Server WebUI

 on:
-  workflow_dispatch: # allows manual triggering
+  workflow_dispatch:
    inputs:
      sha:
        description: 'Commit SHA1 to build'
@@ -13,16 +13,14 @@ on:
    paths: [
      '.github/workflows/server-webui.yml',
      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
+      'tools/server/tests/**.*'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/server-webui.yml',
      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
+      'tools/server/tests/**.*'
    ]

 env:
@@ -36,9 +34,14 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  webui-check:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
+  webui-checks:
    name: WebUI Checks
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    needs: webui-build
+    runs-on: ubuntu-24.04-arm
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -51,7 +54,7 @@ jobs:
        id: node
        uses: actions/setup-node@v6
        with:
-          node-version: "22"
+          node-version: "24"
          cache: "npm"
          cache-dependency-path: "tools/server/webui/package-lock.json"

@@ -71,6 +74,47 @@ jobs:
        run: npm run lint
        working-directory: tools/server/webui

+      - name: Install Playwright browsers
+        id: playwright
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npx playwright install --with-deps
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Unit tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:unit
+        working-directory: tools/server/webui
+
+  e2e-tests:
+    name: E2E Tests
+    needs: webui-build
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        id: node
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install dependencies
+        id: setup
+        if: ${{ steps.node.conclusion == 'success' }}
+        run: npm ci
+        working-directory: tools/server/webui
+
      - name: Build application
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run build
@@ -87,16 +131,6 @@ jobs:
        run: npm run build-storybook
        working-directory: tools/server/webui

-      - name: Run Client tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:client
-        working-directory: tools/server/webui
-
-      - name: Run Unit tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:unit
-        working-directory: tools/server/webui
-
      - name: Run UI tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -54,7 +54,12 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  webui-build:
+    name: Build WebUI
+    uses: ./.github/workflows/webui-build.yml
+
  server:
+    needs: webui-build
    runs-on: ubuntu-latest

    name: server (${{ matrix.wf_name }})
@@ -93,6 +98,12 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Build
        id: cmake_build
        run: |
@@ -125,6 +136,7 @@ jobs:
          SLOW_TESTS=1 pytest -v -x

  server-windows:
+    needs: webui-build
    runs-on: windows-2022

    steps:
@@ -135,6 +147,12 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/webui-build.yml
+++ b/.github/workflows/webui-build.yml
@@ -0,0 +1,44 @@
+name: Build WebUI
+
+on:
+  workflow_call:
+
+jobs:
+  build:
+    name: Build WebUI
+    runs-on: ubuntu-slim
+    env:
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install dependencies
+        run: npm ci
+        working-directory: tools/server/webui
+
+      - name: Build application
+        run: npm run build
+        working-directory: tools/server/webui
+
+      - name: Generate checksums
+        run: |
+          cd tools/server/public
+          for f in *; do
+            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
+          done
+
+      - name: Upload built webui
+        uses: actions/upload-artifact@v6
+        with:
+          name: webui-build
+          path: tools/server/public/
+          retention-days: 1
--- a/.github/workflows/webui-publish.yml
+++ b/.github/workflows/webui-publish.yml
@@ -0,0 +1,69 @@
+name: WebUI Publish
+
+on:
+  workflow_call:
+    inputs:
+      version_tag:
+        description: 'Version tag to publish under (e.g., b1234)'
+        required: true
+        type: string
+    secrets:
+      hf_token:
+        description: 'Hugging Face token with write access'
+        required: true
+
+jobs:
+  publish:
+    name: Publish WebUI Static Output
+    runs-on: ubuntu-24.04-arm
+
+    permissions:
+      contents: read
+
+    env:
+      HF_BUCKET_NAME: ${{ vars.HF_BUCKET_WEBUI_STATIC_OUTPUT }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 1
+
+      - name: Download WebUI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: webui-build
+          path: tools/server/public/
+
+      - name: Install Hugging Face Hub CLI
+        run: pip install -U huggingface_hub
+
+      - name: Authenticate with Hugging Face
+        # Pass the secret through an environment variable instead of expanding it
+        # into the script text, so the shell never parses the token as code.
+        env:
+          HF_LOGIN_TOKEN: ${{ secrets.hf_token }}
+        run: hf auth login --token "$HF_LOGIN_TOKEN"
+
+      - name: Sync built files to Hugging Face bucket (version tag)
+        run: |
+          # Upload the built files to the Hugging Face bucket under the release version
+          hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
+
+      - name: Sync built files to Hugging Face bucket (latest)
+        run: |
+          # Also upload to the 'latest' directory for fallback downloads
+          hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
+
+      - name: Verify upload
+        run: |
+          # List the files in the bucket to verify the upload
+          hf buckets list hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} -R -h
+
+      - name: Clean up root-level files
+        run: |
+          # Clean up any old root-level files from previous non-versioned deployments
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/index.html --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.js --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.css --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/loading.html --yes 2>/dev/null || true
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@
 /tmp/
 /autogen-*.md
 /common/build-info.cpp
+/tools/server/public

 # Deprecated

@@ -96,8 +97,6 @@

 /tools/server/webui/node_modules
 /tools/server/webui/dist
-# we no longer use gz for index.html
-/tools/server/public/index.html.gz

 # Python

@@ -105,9 +104,12 @@
 __pycache__/
 */poetry.lock
 poetry.toml
+poetry.lock
+uv.lock

 # Nix

+flake.lock
 /result

 # Test binaries
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -4,6 +4,7 @@ General:
 - By very precise and concise when writing code, comments, explanations, etc.
 - PR and commit titles format: `<module> : <title>`. Lookup recents for examples
 - Don't try to build or run the code unless you are explicitly asked to do so
+- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources

 Coding:
 - When in doubt, always refer to the CONTRIBUTING.md file of the project
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,13 +104,14 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})

 # extra artifacts
-option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_WEBUI    "llama: build the embedded Web UI for server"  ON)
-option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL  "llama: install tests"        ON)
+option(LLAMA_BUILD_TESTS            "llama: build tests"                                                                            ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS            "llama: build tools"                                                                            ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES         "llama: build examples"                                                                         ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER           "llama: build server example"                                                                   ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_WEBUI            "llama: build the embedded Web UI for server"                                                   ON)
+option(LLAMA_USE_PREBUILT_WEBUI     "llama: use prebuilt WebUI from HF Bucket when available (requires LLAMA_BUILD_WEBUI=ON)"       ON)
+option(LLAMA_TOOLS_INSTALL          "llama: install tools"                                                                          ${LLAMA_TOOLS_INSTALL_DEFAULT})
+option(LLAMA_TESTS_INSTALL          "llama: install tests"                                                                          ON)

 # 3rd party libs
 option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -76,6 +76,7 @@
 /ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
 /ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
 /ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
+/ggml/src/ggml-zendnn/                  @avinashcpandey @Jiten1parmar @z-vishal
 /ggml/src/ggml.c                        @ggerganov
 /ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -46,7 +46,9 @@ Before submitting your PR:
    - provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
    - provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
-- If you are a new contributor, limit your open PRs to 1.
+- If you are a new contributor
+    - Limit your open PRs to 1
+    - Do not submit trivial fixes (e.g. typos, formatting changes)

 After submitting your PR:
 - Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
--- a/README.md
+++ b/README.md
@@ -529,6 +529,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
 - [How to build](docs/build.md)
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
+- [Multi-GPU usage](docs/multi-gpu.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -248,6 +248,8 @@ std::vector<std::string> common_arg::get_env() const {

 // Helper function to parse tensor buffer override strings
 static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+    ggml_backend_load_all();
+
    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        auto * dev = ggml_backend_dev_get(i);
@@ -306,12 +308,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
    common_download_opts opts;
    opts.bearer_token = params.hf_token;
    opts.offline = params.offline;
+
+    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
    const int status = common_download_file_single(preset_url, preset_path, opts);
    const bool has_preset = status >= 200 && status < 400;

    // remote preset is optional, so we don't error out if not found
    if (has_preset) {
-        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
        common_preset_context ctx(ex, /* only_remote_allowed */ true);
        common_preset global;
        auto remote_presets = ctx.load_from_ini(preset_path, global);
@@ -324,7 +328,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
        }
    } else {
-        LOG_INF("%s", "no remote preset found, skipping\n");
+        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
    }

    return has_preset;
@@ -355,8 +359,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
        auto download_result = common_download_model(model, opts, true);

        if (download_result.model_path.empty()) {
-            LOG_ERR("error: failed to download model from Hugging Face\n");
-            exit(1);
+            throw std::runtime_error("failed to download model from Hugging Face");
        }

        model.name = model.hf_repo;
@@ -378,8 +381,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
        opts.offline = offline;
        auto download_result = common_download_model(model, opts);
        if (download_result.model_path.empty()) {
-            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
-            exit(1);
+            throw std::runtime_error("failed to download model from " + model.url);
        }
    }

@@ -425,10 +427,33 @@ static bool parse_bool_value(const std::string & value) {
    }
 }

+[[noreturn]] static void arg_removed(const std::string & msg) {
+    throw std::invalid_argument("the argument has been removed. " + msg);
+}
+
 //
 // CLI argument parsing functions
 //

+void common_params_handle_models(common_params & params, llama_example curr_ex) {
+    auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
+    if (params.no_mmproj) {
+        params.mmproj = {};
+    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+        // optionally, handle mmproj model when -hf is specified
+        params.mmproj = res.mmproj;
+    }
+    // only download mmproj if the current example is using it
+    for (const auto & ex : mmproj_examples) {
+        if (curr_ex == ex) {
+            common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
+            break;
+        }
+    }
+    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
+    common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

@@ -582,22 +607,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

    // handle model and download
    if (!skip_model_download) {
-        auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (const auto & ex : mmproj_examples) {
-            if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
-                break;
-            }
-        }
-        common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
-        common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
+        common_params_handle_models(params, ctx_arg.ex);
    }

    // model is required (except for server)
@@ -616,10 +626,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
-        for (auto & pair : params.speculative.draft.replacements) {
-            string_process_escapes(pair.first);
-            string_process_escapes(pair.second);
-        }
    }

    if (!params.kv_overrides.empty()) {
@@ -803,6 +809,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
    if (dev_names.size() == 1 && dev_names[0] == "none") {
        devices.push_back(nullptr);
    } else {
+        ggml_backend_load_all();
        for (const auto & device : dev_names) {
            auto * dev = ggml_backend_dev_by_name(device.c_str());
            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
@@ -820,6 +827,7 @@ static void add_rpc_devices(const std::string & servers) {
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
    }
+    ggml_backend_load_all();
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
@@ -1016,9 +1024,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex

    params.use_color = tty_can_use_colors();

-    // load dynamic backends
-    ggml_backend_load_all();
-
    common_params_context ctx_arg(params);
    ctx_arg.print_usage = print_usage;
    ctx_arg.ex          = ex;
@@ -2218,7 +2223,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
-            "comma separated list of RPC servers (host:port)",
+            "comma-separated list of RPC servers (host:port)",
            [](common_params & params, const std::string & value) {
                add_rpc_devices(value);
                GGML_UNUSED(params);
@@ -2275,6 +2280,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--list-devices"},
        "print list of available devices and exit",
        [](common_params &) {
+            ggml_backend_load_all();
            std::vector<ggml_backend_dev_t> devices;
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                auto * dev = ggml_backend_dev_get(i);
@@ -2864,7 +2870,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--tools"}, "TOOL1,TOOL2,...",
        "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
        "specify \"all\" to enable all tools\n"
-        "available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
+        "available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime",
        [](common_params & params, const std::string & value) {
            params.server_tools = parse_csv_row(value);
        }
@@ -3297,18 +3303,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
+        {"--no-log-prefix"},
        "Enable prefix in log messages",
-        [](common_params &) {
-            common_log_set_prefix(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_prefix(common_log_main(), value);
        }
-    ).set_env("LLAMA_LOG_PREFIX"));
+    ).set_env("LLAMA_ARG_LOG_PREFIX"));
    add_opt(common_arg(
        {"--log-timestamps"},
+        {"--no-log-timestamps"},
        "Enable timestamps in log messages",
-        [](common_params &) {
-            common_log_set_timestamps(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_timestamps(common_log_main(), value);
        }
-    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+    ).set_env("LLAMA_ARG_LOG_TIMESTAMPS"));

    //
    // speculative parameters
@@ -3380,7 +3388,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"--spec-draft-poll", "--poll-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: same as --poll])",
+        "Use polling to wait for draft model work (default: same as --poll)",
        [](common_params & params, int value) {
            params.speculative.draft.cpuparams.poll = value;
        }
@@ -3512,13 +3520,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.draft.p_min = std::stof(value);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
-    add_opt(common_arg(
-        {"--spec-draft-ctx-size", "-cd", "--ctx-size-draft"}, "N",
-        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.draft.n_ctx),
-        [](common_params & params, int value) {
-            params.speculative.draft.n_ctx = value;
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CTX_SIZE"));
    add_opt(common_arg(
        {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -3555,32 +3556,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
-        {"--spec-draft-replace", "--spec-replace"}, "TARGET", "DRAFT",
-        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
-        [](common_params & params, const std::string & tgt, const std::string & dft) {
-            params.speculative.draft.replacements.push_back({ tgt, dft });
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
-    add_opt(common_arg(
-        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
-        string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
-            common_speculative_type_to_str(params.speculative.type).c_str()),
+        {"--spec-type"}, common_speculative_all_types_str(),
+        string_format("comma-separated list of types of speculative decoding to use (default: %s)\n",
+            common_speculative_type_name_str(params.speculative.types).c_str()),
        [](common_params & params, const std::string & value) {
-            if (value == "none") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
-            } else if (value == "ngram-cache") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
-            } else if (value == "ngram-simple") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
-            } else if (value == "ngram-map-k") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
-            } else if (value == "ngram-map-k4v") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
-            } else if (value == "ngram-mod") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
-            } else {
-                throw std::invalid_argument("unknown speculative decoding type without draft model");
-            }
+            const auto enabled_types = string_split<std::string>(value, ',');
+            params.speculative.types = common_speculative_types_from_names(enabled_types);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE"));
    add_opt(common_arg(
@@ -3715,35 +3696,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--draft", "--draft-n", "--draft-max"}, "N",
        "the argument has been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max",
        [](common_params & /*params*/, int /*value*/) {
-            throw std::invalid_argument("the argument has been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max");
+            arg_removed("use --spec-draft-n-max or --spec-ngram-mod-n-max");
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
    add_opt(common_arg(
        {"--draft-min", "--draft-n-min"}, "N",
        "the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min",
        [](common_params & /*params*/, int /*value*/) {
-            throw std::invalid_argument("the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min");
+            arg_removed("use --spec-draft-n-min or --spec-ngram-mod-n-min");
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
    add_opt(common_arg(
        {"--spec-ngram-size-n"}, "N",
        "the argument has been removed. use the respective --spec-ngram-*-size-n or --spec-ngram-mod-n-match",
        [](common_params & /*params*/, int /*value*/) {
-            throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-size-n");
+            arg_removed("use the respective --spec-ngram-*-size-n or --spec-ngram-mod-n-match"); // keep in sync with the help text above
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--spec-ngram-size-m"}, "N",
        "the argument has been removed. use the respective --spec-ngram-*-size-m",
        [](common_params & /*params*/, int /*value*/) {
-            throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-size-m");
+            arg_removed("use the respective --spec-ngram-*-size-m");
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--spec-ngram-min-hits"}, "N",
        "the argument has been removed. use the respective --spec-ngram-*-min-hits",
        [](common_params & /*params*/, int /*value*/) {
-            throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-min-hits");
+            arg_removed("use the respective --spec-ngram-*-min-hits");
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SERVER}));

@@ -3794,7 +3775,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-algorithm"}, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+        string_format(
+            "diffusion algorithm: 0=DIFFUSION_ALGORITHM_ORIGIN, 1=DIFFUSION_ALGORITHM_ENTROPY_BASED, "
+            "2=DIFFUSION_ALGORITHM_MARGIN_BASED, 3=DIFFUSION_ALGORITHM_RANDOM, "
+            "4=DIFFUSION_ALGORITHM_CONFIDENCE_BASED (default: %d)", params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
@@ -4066,7 +4050,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--spec-default"},
        string_format("enable default speculative decoding config"),
        [](common_params & params) {
-            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };
            params.speculative.ngram_mod.n_match = 24;
            params.speculative.ngram_mod.n_min = 48;
            params.speculative.ngram_mod.n_max = 64;
--- a/common/arg.h
+++ b/common/arg.h
@@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

+// Populate model paths (main model, mmproj, etc) from -hf if necessary
+void common_params_handle_models(common_params & params, llama_example curr_ex);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
        if (!end.empty()) {
            if (!start.empty()) {
                // Standard tag-based: optional(<think>reasoning</think>)
-                return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
+                return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
            }
            // Delimiter-style (empty start)
-            return p.optional(p.reasoning(p.until(end)) + end + p.space());
+            return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
        }
    }

@@ -186,7 +186,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
 common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    // Build effective field names with dot notation if function_field is set
    std::string name_field = format.name_field;
@@ -225,8 +224,7 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        tool_start = format.per_call_start;
    }

-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
-           p.end();
+    return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
 }

 common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
@@ -270,7 +268,6 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    common_peg_parser tool_choice = p.choice();

@@ -336,14 +333,12 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
+    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
 }

 common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));

@@ -374,9 +369,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.tool_arg_string_value(p.schema(until_suffix,
-                                                                 "tool-" + name + "-arg-" + param_name + "-schema",
-                                                                 param_schema, true)) :
+                                p.tool_arg_string_value(until_suffix) :
                                p.tool_arg_json_value(p.schema(
                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
                                    p.space()) +
@@ -471,8 +464,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
+    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
 }

 }  // namespace autoparser
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
    if (left_trimmed.empty() && !diff.right.empty()) {
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
-                start = trim_leading_whitespace(diff.right);
+                start = diff.right;
                mode  = reasoning_mode::TAG_BASED;
            }
        }
@@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
                if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
                    start = seg[seg.size() - 2].value;
                }
-                end = trim_trailing_whitespace(diff.left);
+                end = diff.left;
                mode = reasoning_mode::TAG_BASED;
            }
        }
@@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
        if (result.result.success()) {
            start = result.tags["pre"];
-            end = trim_trailing_whitespace(result.tags["post"]);
+            end = result.tags["post"];
        } else {
            auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
                return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
            });
            result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
            if (result.result.success()) {
-                end = trim_trailing_whitespace(result.tags["post"]);
+                end = result.tags["post"];
            } else {
                LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
                mode = reasoning_mode::NONE;
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -816,6 +816,32 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
    return literal(s.substr(0, s.rfind(delimiter)));
 }

+// Build a parser for `tag` in which every leading and trailing whitespace character is
+// individually optional, while the part between them must match literally.
+common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
+    // std::isspace has undefined behavior for negative char values, so go through unsigned char
+    const auto is_space = [](char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; };
+    // [end_of_prefix_space, start_of_suffix_space) is the tag without its leading/trailing whitespace
+    size_t end_of_prefix_space = 0;
+    while (end_of_prefix_space < tag.size() && is_space(tag[end_of_prefix_space])) {
+        end_of_prefix_space++;
+    }
+    size_t start_of_suffix_space = tag.size();
+    while (start_of_suffix_space > end_of_prefix_space && is_space(tag[start_of_suffix_space - 1])) {
+        start_of_suffix_space--;
+    }
+
+    auto parser = eps();
+    for (size_t i = 0; i < end_of_prefix_space; i++) {
+        parser += optional(literal(std::string(1, tag[i])));
+    }
+    parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
+    for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
+        parser += optional(literal(std::string(1, tag[i])));
+    }
+    return parser;
+}
+
 common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                       const std::string &              section_start,
                                                       const std::string &              section_end,
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -96,6 +96,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
    // Return a parser that parses the prefix of a string, up to a given delimiter.
    common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});

+    // Return a parser that matches tag, where each leading/trailing whitespace character of tag is optional
+    common_peg_parser optspace(const std::string & tag);
+
    // Legacy-compatible helper for building standard JSON tool calls
    // Used by tests and manual parsers
    // name_key/args_key: JSON key names for function name and arguments
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -80,7 +80,7 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
    if (!content.empty()) {
        jmsg["content"] = content;
    } else if (!content_parts.empty()) {
-        if (concat_typed_text) {
+        if (concat_typed_text || contains_media()) {
            std::string text;
            bool last_was_media_marker = false;
            // join parts with newline, do not add newline before or after media markers
@@ -2116,22 +2116,38 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
    return std::nullopt;
 }

+// Render the template with and without add_generation_prompt and return the part of the
+// "with" rendering that follows the longest common prefix of the two renderings.
+static std::string common_chat_templates_generation_prompt(const common_chat_template & tmpl, const autoparser::generation_params & inputs) {
+    autoparser::generation_params render_params = inputs;
+    render_params.add_generation_prompt = false;
+    const std::string rendered_plain = common_chat_template_direct_apply_impl(tmpl, render_params);
+    render_params.add_generation_prompt = true;
+    const std::string rendered_gen = common_chat_template_direct_apply_impl(tmpl, render_params);
+
+    const size_t limit = std::min(rendered_plain.size(), rendered_gen.size());
+    size_t n_common = 0;
+    for (; n_common < limit && rendered_plain[n_common] == rendered_gen[n_common]; ++n_common) {}
+    return rendered_gen.substr(n_common);
+}
+
 static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates *        tmpls,
                                                            const struct common_chat_templates_inputs & inputs) {
    autoparser::generation_params params;
    params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
    const auto & tmpl =
        params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
-    const auto & src        = tmpl.source();
-    const auto & caps       = tmpl.original_caps();
-    params.messages         = render_message_to_json(inputs.messages, tmpl.original_caps());
-    params.tool_choice      = inputs.tool_choice;
-    params.reasoning_format = inputs.reasoning_format;
-    params.enable_thinking  = inputs.enable_thinking;
-    params.grammar          = inputs.grammar;
-    params.now              = inputs.now;
-    params.add_bos          = tmpls->add_bos;
-    params.add_eos          = tmpls->add_eos;
+    const auto & src             = tmpl.source();
+    const auto & caps            = tmpl.original_caps();
+    params.messages              = render_message_to_json(inputs.messages, tmpl.original_caps());
+    params.tool_choice           = inputs.tool_choice;
+    params.reasoning_format      = inputs.reasoning_format;
+    params.enable_thinking       = inputs.enable_thinking;
+    params.grammar               = inputs.grammar;
+    params.now                   = inputs.now;
+    params.add_generation_prompt = inputs.add_generation_prompt;
+    params.add_bos               = tmpls->add_bos;
+    params.add_eos               = tmpls->add_eos;

    if (src.find("<|channel|>") == std::string::npos) {
        // map developer to system for all models except for GPT-OSS
@@ -2153,14 +2169,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        workaround::func_args_not_string(params.messages);
    }

-    params.add_generation_prompt = false;
-    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
-    params.add_generation_prompt = true;
-    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
-    auto        diff             = calculate_diff_split(no_gen_prompt, gen_prompt);
-    params.generation_prompt     = diff.right + diff.suffix;
-
-    params.add_generation_prompt = inputs.add_generation_prompt;
+    params.generation_prompt = common_chat_templates_generation_prompt(tmpl, params);

    params.extra_context = common_chat_extra_context();
    for (auto el : inputs.chat_template_kwargs) {
@@ -2212,8 +2221,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
-            auto_params.thinking_start_tag = autoparser.reasoning.start;
-            auto_params.thinking_end_tag   = autoparser.reasoning.end;
+            auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
+            auto_params.thinking_end_tag   = trim_whitespace(autoparser.reasoning.end);
        }
        auto_params.generation_prompt = params.generation_prompt;
        common_peg_arena arena;
--- a/common/chat.h
+++ b/common/chat.h
@@ -94,6 +94,15 @@ struct common_chat_msg {
               tool_name.empty() && tool_call_id.empty();
    }

+    // true when at least one entry of content_parts has the type "media_marker"
+    bool contains_media() const {
+        bool has_media = false;
+        for (const auto & p : content_parts) {
+            has_media = has_media || p.type == "media_marker";
+        }
+        return has_media;
+    }
+
    void set_tool_call_ids(std::vector<std::string> &           ids_cache,
                           const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -366,15 +366,29 @@ void common_init() {
    SetConsoleCP(CP_UTF8);
 #endif

-    llama_log_set(common_log_default_callback, NULL);
+    common_log_set_prefix(common_log_main(), true);
+    common_log_set_timestamps(common_log_main(), true);

+    llama_log_set(common_log_default_callback, NULL);
+}
+
+void common_params_print_info(const common_params & params) { // logs build, verbosity, device and system info
 #ifdef NDEBUG
    const char * build_type = "";
 #else
    const char * build_type = " (debug)";
 #endif
+    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);

-    LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
+    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
+    LOG_INF("device_info:\n");
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { // one line per registered backend device
+        auto * dev = ggml_backend_dev_get(i);
+        size_t free, total; // filled in by ggml_backend_dev_memory below
+        ggml_backend_dev_memory(dev, &free, &total);
+        LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+    }
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 }

 std::string common_params_get_system_info(const common_params & params) {
@@ -1147,7 +1161,8 @@ common_init_result::common_init_result(common_params & params) :
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory ...\n", __func__);
+        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
@@ -1196,7 +1211,7 @@ common_init_result::common_init_result(common_params & params) :
    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }
@@ -1209,12 +1224,12 @@ common_init_result::common_init_result(common_params & params) :
    }

    //if (params.sampling.penalty_last_n == -1) {
-    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
-    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

@@ -1422,7 +1437,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {

    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
+        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1960,3 +1975,102 @@ bool common_prompt_batch_decode(

    return true;
 }
+
+// Snapshot the state of sequence seq_id of ctx into data (shared by update_tgt/update_dft).
+// Does nothing when ctx is null; aborts if llama writes a different number of bytes than announced.
+static void common_prompt_checkpoint_save(
+        std::vector<uint8_t> & data,
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    if (ctx == nullptr) {
+        return;
+    }
+
+    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
+
+    data.resize(ckpt_size);
+
+    const size_t n = llama_state_seq_get_data_ext(ctx, data.data(), ckpt_size, seq_id, flags);
+    if (n != ckpt_size) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
+    }
+}
+
+// Write a previously saved snapshot back into sequence seq_id of ctx (shared by load_tgt/load_dft).
+// Does nothing when ctx is null or data is empty; aborts if llama consumes a different number of bytes.
+static void common_prompt_checkpoint_restore(
+        const std::vector<uint8_t> & data,
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    if (ctx == nullptr || data.empty()) {
+        return;
+    }
+
+    const size_t n = llama_state_seq_set_data_ext(ctx, data.data(), data.size(), seq_id, flags);
+    if (n != data.size()) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data.size(), n);
+    }
+}
+
+// combined size in bytes of data_tgt and data_dft
+size_t common_prompt_checkpoint::size() const {
+    return data_tgt.size() + data_dft.size();
+}
+
+// note: only data_tgt is inspected, data_dft is not considered
+bool common_prompt_checkpoint::empty() const {
+    return data_tgt.empty();
+}
+
+// reset the bookkeeping to zero and drop both snapshots
+void common_prompt_checkpoint::clear() {
+    n_tokens = 0;
+
+    pos_min = 0;
+    pos_max = 0;
+
+    data_tgt.clear();
+    data_dft.clear();
+}
+
+// record the token count and position range; the snapshot buffers are not touched
+void common_prompt_checkpoint::update_pos(
+        int64_t n_tokens,
+        llama_pos pos_min,
+        llama_pos pos_max) {
+    this->n_tokens = n_tokens;
+    this->pos_min  = pos_min;
+    this->pos_max  = pos_max;
+}
+
+// update_tgt / update_dft: store the sequence state of ctx in data_tgt / data_dft
+void common_prompt_checkpoint::update_tgt(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    common_prompt_checkpoint_save(data_tgt, ctx, seq_id, flags);
+}
+
+void common_prompt_checkpoint::update_dft(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    common_prompt_checkpoint_save(data_dft, ctx, seq_id, flags);
+}
+
+// load_tgt / load_dft: write data_tgt / data_dft back into the sequence state of ctx
+void common_prompt_checkpoint::load_tgt(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) const {
+    common_prompt_checkpoint_restore(data_tgt, ctx, seq_id, flags);
+}
+
+void common_prompt_checkpoint::load_dft(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) const {
+    common_prompt_checkpoint_restore(data_dft, ctx, seq_id, flags);
+}
--- a/common/common.h
+++ b/common/common.h
@@ -157,9 +157,9 @@ enum common_params_sampling_config : uint64_t {

 enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
-    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
-    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
+    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
@@ -295,8 +295,6 @@ struct common_params_model {
    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

-struct common_ngram_mod;
-
 // draft-model-based speculative decoding parameters
 struct common_params_speculative_draft {
    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
@@ -307,11 +305,9 @@ struct common_params_speculative_draft {

    common_params_model mparams;

-    llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
+    llama_context * ctx_tgt = nullptr;
+    llama_context * ctx_dft = nullptr;

-    llama_context_params cparams; // these are the parameters for the draft llama_context
-
-    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
@@ -322,7 +318,6 @@ struct common_params_speculative_draft {

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 };

@@ -331,9 +326,6 @@ struct common_params_speculative_ngram_mod {

    int32_t n_max = 64;
    int32_t n_min = 48;
-
-    // shared instance of the ngram container for all speculative decoding contexts
-    std::shared_ptr<common_ngram_mod> obj;
 };

 struct common_params_speculative_ngram_map {
@@ -348,9 +340,9 @@ struct common_params_speculative_ngram_cache {
 };

 struct common_params_speculative {
-    // TODO: become a vector in order to support "chains of speculators"
-    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
+    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };

+    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
    common_params_speculative_draft draft;

    common_params_speculative_ngram_mod ngram_mod;
@@ -613,7 +605,11 @@ struct common_params {
    std::map<std::string, std::string> default_template_kwargs;

    // webui configs
-    bool webui = true;
+#ifdef LLAMA_WEBUI_DEFAULT_ENABLED
+    bool webui = LLAMA_WEBUI_DEFAULT_ENABLED != 0;
+#else
+    bool webui = true; // default to enabled when not set
+#endif
    bool webui_mcp_proxy = false;
    std::string webui_config_json;

@@ -694,6 +690,7 @@ struct common_params {
 // initializes the logging system and prints info about the build
 void common_init();

+void common_params_print_info(const common_params & params);
 std::string common_params_get_system_info(const common_params & params);

 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
@@ -1026,3 +1023,47 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

 // "adamw" or "sgd" (case insensitive)
 enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
+
+//
+// prompt utils
+//
+
+struct common_prompt_checkpoint {
+    int64_t n_tokens;
+
+    llama_pos pos_min;
+    llama_pos pos_max;
+
+    std::vector<uint8_t> data_tgt;
+    std::vector<uint8_t> data_dft;
+
+    size_t size() const;
+
+    bool empty() const;
+    void clear();
+
+    void update_pos(
+            int64_t n_tokens,
+            llama_pos pos_min,
+            llama_pos pos_max);
+
+    void update_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    void update_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    void load_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const;
+
+    void load_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const;
+};
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -320,9 +320,9 @@ static int common_download_file_single_online(const std::string & url,

    auto head = cli.Head(parts.path);
    if (!head || head->status < 200 || head->status >= 300) {
-        LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
+        LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
        if (file_exists) {
-            LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
+            LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        return head ? head->status : -1;
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -109,16 +109,24 @@ static std::vector<llama_device_memory_data> common_get_device_memory_data(
        ret.back().total = total;
    }
    for (size_t i = 0; i < nd; i++) {
+        ggml_backend_dev_t dev = llama_model_get_device(model, i);
+
        size_t free;
        size_t total;
-        ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
+        ggml_backend_dev_memory(dev, &free, &total);

-        // devices can return 0 bytes for free and total memory if they do not
-        // have any to report. in this case, we will use the host memory as a fallback
-        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        // Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
+        // the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
+        // not assign anything to a device with an unknown memory budget.
        if (free == 0 && total == 0) {
-            free  = ret.back().free;
-            total = ret.back().total;
+            const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
+            if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
+                LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
+                        __func__, ggml_backend_dev_name(dev));
+            } else {
+                free  = ret.back().free;
+                total = ret.back().total;
+            }
        }
        ret[i].free  = free;
        ret[i].total = total;
@@ -160,7 +168,7 @@ static void common_params_fit_impl(

    // step 1: get data for default parameters and check whether any changes are necessary in the first place

-    LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
+    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices

@@ -205,13 +213,13 @@ static void common_params_fit_impl(
        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (sum_projected_free >= margins[0]) {
-            LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
                __func__, sum_projected_free/MiB, margins[0]/MiB);
            return;
        }
    } else {
        if (nd > 1) {
-            LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+            LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__);
        }
        for (size_t id = 0; id < nd; id++) {
            const llama_device_memory_data & dmd = dmds_full[id];
@@ -226,16 +234,16 @@ static void common_params_fit_impl(
            sum_projected_model += dmd.mb.model;

            if (nd > 1) {
-                LOG_INF("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                LOG_TRC("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
            }
        }
        assert(sum_free >= 0 && sum_projected_used >= 0);
-        LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+        LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (nd == 1) {
            if (projected_free_per_device[0] >= margins[0]) {
-                LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
                return;
            }
@@ -248,7 +256,7 @@ static void common_params_fit_impl(
                }
            }
            if (!changes_needed) {
-                LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
                return;
            }
        }
@@ -267,10 +275,10 @@ static void common_params_fit_impl(
        }
        if (global_surplus < 0) {
            if (nd <= 1) {
-                LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                    __func__, margins[0]/MiB, -global_surplus/MiB);
            } else {
-                LOG_INF(
+                LOG_TRC(
                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
                    __func__, -global_surplus/MiB);
            }
@@ -312,28 +320,28 @@ static void common_params_fit_impl(

                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                        if (nd <= 1) {
-                            LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
+                            LOG_TRC("%s: entire model can be fit by reducing context\n", __func__);
                            return;
                        }
-                        LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
+                        LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__);
                    } else {
                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                    }
                } else {
                    if (n_ctx_min == UINT32_MAX) {
-                        LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+                        LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
                    } else {
-                        LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+                        LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                            __func__, hp_nct, n_ctx_min);
                    }
                }
            } else {
-                LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
+                LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
    }
@@ -477,10 +485,10 @@ static void common_params_fit_impl(
        const dmds_t dmd_nl = common_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

-        LOG_INF("%s: memory for test allocation by device:\n", func_name);
+        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
-            LOG_INF(
+            LOG_TRC(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }
@@ -501,7 +509,7 @@ static void common_params_fit_impl(
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

-        LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
+        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

@@ -511,10 +519,10 @@ static void common_params_fit_impl(
        }

        if (global_surplus_cpu_moe > 0) {
-            LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
-            LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

@@ -527,7 +535,7 @@ static void common_params_fit_impl(
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margins[id]);
-        LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
+        LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
@@ -547,9 +555,9 @@ static void common_params_fit_impl(
    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
    //   - the last device has the output layer, which cannot be a partial layer
    if (hp_nex == 0) {
-        LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense layers back-to-front:\n", __func__);
    } else {
-        LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    for (int id = nd - 1; id >= 0; id--) {
        uint32_t n_unassigned = hp_ngl + 1;
@@ -568,7 +576,7 @@ static void common_params_fit_impl(
            if (mem_high[id] > targets[id]) {
                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
+                LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
@@ -585,11 +593,11 @@ static void common_params_fit_impl(
                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem            = mem_test;
-                        LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high            = mem_test;
-                        LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
@@ -597,12 +605,12 @@ static void common_params_fit_impl(
                assert(ngl_per_device_high[id].n_layer == n_unassigned);
                ngl_per_device = ngl_per_device_high;
                mem            = mem_high;
-                LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
@@ -626,7 +634,7 @@ static void common_params_fit_impl(
    }
    assert(id_dense_start < nd);

-    LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
+    LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
@@ -666,13 +674,13 @@ static void common_params_fit_impl(
                    ngl_per_device = ngl_per_device_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high            = mem_test;
                    id_dense_start_high = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
@@ -682,7 +690,7 @@ static void common_params_fit_impl(
            ngl_per_device = ngl_per_device_high;
            mem            = mem_high;
            id_dense_start = id_dense_start_high;
-            LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+            LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

@@ -702,44 +710,44 @@ static void common_params_fit_impl(
            if (id < nd - 1) {
                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
            }
-            LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
+            LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                ngl_per_device = ngl_per_device_test;
                overflow_bufts = overflow_bufts_test;
                mem            = mem_test;
                id_dense_start = id_dense_start_test;
-                LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
+                LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
@@ -747,7 +755,7 @@ static void common_params_fit_impl(
    // print info for devices that were not changed during the conversion from dense only to full layers:
    for (size_t id = id_dense_start + 1; id < nd; id++) {
        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
@@ -768,7 +776,7 @@ enum common_params_fit_status common_fit_params(
    common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
    try {
        common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
-        LOG_INF("%s: successfully fit params to free device memory\n", __func__);
+        LOG_TRC("%s: successfully fit params to free device memory\n", __func__);
    } catch (const common_params_fit_exception & e) {
        LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        status = COMMON_PARAMS_FIT_STATUS_FAILURE;
@@ -777,7 +785,7 @@ enum common_params_fit_status common_fit_params(
        status = COMMON_PARAMS_FIT_STATUS_ERROR;
    }
    const int64_t t1_us = llama_time_us();
-    LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
+    LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return status;
 }

@@ -917,7 +925,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
        }
    }
    for (const auto & td : table_data) {
-        LOG_INF(td[0].c_str(),
+        LOG_TRC(td[0].c_str(),
            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
            td[6].c_str(), td[7].c_str(), td[8].c_str());
    }
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -435,10 +435,10 @@ void common_log_flush(struct common_log * log) {
 static int common_get_verbosity(enum ggml_log_level level) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
-        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
+        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_TRACE;
        case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
        case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
-        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
+        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_TRACE;
        case GGML_LOG_LEVEL_NONE:
        default:
            return LOG_LEVEL_OUTPUT;
--- a/common/log.h
+++ b/common/log.h
@@ -21,7 +21,8 @@
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#define LOG_LEVEL_DEBUG  4
+#define LOG_LEVEL_DEBUG  5
+#define LOG_LEVEL_TRACE  4
 #define LOG_LEVEL_INFO   3
 #define LOG_LEVEL_WARN   2
 #define LOG_LEVEL_ERROR  1
@@ -111,13 +112,15 @@ void common_log_flush         (struct common_log * log);                    // f
 #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)

 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG,  __VA_ARGS__)
+#define LOG_TRC(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_TRACE,  __VA_ARGS__)
 #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,   __VA_ARGS__)
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,   __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR,  __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,   __VA_ARGS__) // same as INFO

+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_TRCV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__) // like LOG_TRC: trace is emitted at ggml INFO level
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
 #define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {
    }
 }

-void common_preset::apply_to_params(common_params & params) const {
+void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
    for (const auto & [opt, val] : options) {
+        if (!handled_keys.empty()) {
+            if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
+                continue;
+            }
+        }
        // apply each option to params
        if (opt.handler_string) {
            opt.handler_string(params, val);
--- a/common/preset.h
+++ b/common/preset.h
@@ -43,7 +43,8 @@ struct common_preset {
    void merge(const common_preset & other);

    // apply preset options to common_params
-    void apply_to_params(common_params & params) const;
+    // optionally specify handled_keys to apply only a subset of options (identified by their env); if empty, all options are applied
+    void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
 };

 // interface for multiple presets in one file
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

+    gsmpl->set_logits(ctx, idx);
+
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
@@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
            GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
            GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");

-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+            for (size_t i = 0; i < cur_p.size; ++i) {
+                if (cur_p.data[i].id == id) {
+                    cur_p.selected = i;
+                    break;
+                }
+            }

            return id;
        }
    }

-    gsmpl->set_logits(ctx, idx);
-
    // apply reasoning budget first
    llama_sampler_apply(rbudget, &cur_p);

--- a/common/speculative.cpp
+++ b/common/speculative.cpp
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -5,8 +5,14 @@

 struct common_speculative;

+// comma separated list of the provided types
+std::string common_speculative_type_name_str(const std::vector<enum common_speculative_type> & types);
+
 // comma separated list of all types
-std::string common_speculative_type_name_str();
+const char * common_speculative_all_types_str();
+
+// parse user provided types
+std::vector<enum common_speculative_type> common_speculative_types_from_names(const std::vector<std::string> & names);

 // convert string to type
 enum common_speculative_type common_speculative_type_from_name(const std::string & name);
@@ -14,27 +20,44 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 // convert type to string
 std::string common_speculative_type_to_str(enum common_speculative_type type);

-common_speculative * common_speculative_init(
-        common_params_speculative & params,
-        llama_context             * ctx_tgt);
+common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq);

 void common_speculative_free(common_speculative * spec);

+struct common_speculative_draft_params {
+    // this flag is used to chain the drafts through all the available implementations
+    // after the first successful draft from an implementation, we set it
+    //   to false to prevent further drafts for that sequence
+    // at the end of the draft() call, all drafting flags will be reset to false
+    bool drafting = false;
+
+    // overrides individual configurations (-1 disabled)
+    // can be used to constrain the max draft based on the remaining context size
+    int32_t n_max = -1;
+
+    llama_pos   n_past;
+    llama_token id_last;
+
+    // TODO: remove in the future by keeping track of the prompt from the _begin() call and the consecutive accept calls
+    const llama_tokens * prompt;
+
+    // the generated draft from the last _draft() call
+    llama_tokens * result;
+};
+
+common_speculative_draft_params & common_speculative_get_draft_params(common_speculative * spec, llama_seq_id seq_id);
+
 // optionally call once at the beginning of a new generation
-void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);
+void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, const llama_tokens & prompt);

-// sample up to n_draft tokens and add them to the batch using the draft model
-llama_tokens common_speculative_draft(
-                     common_speculative * spec,
-        const common_params_speculative & params,
-                     const llama_tokens & prompt,
-                            llama_token   id_last);
+// process the batch and update the internal state of the speculative context
+bool common_speculative_process(common_speculative * spec, const llama_batch & batch);

-// informs the speculative decoder that n_accepted tokens were accepted by the target model
-void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
+// generate drafts for the sequences specified with `common_speculative_get_draft_params`
+void common_speculative_draft(common_speculative * spec);

-int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
-int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
+// informs the speculative context that n_accepted tokens were accepted by the target model for sequence seq_id
+void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, uint16_t n_accepted);

 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -155,6 +155,7 @@ models = [
    {"name": "joyai-llm",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
    {"name": "kanana2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
    {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
+    {"name": "sarvam-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -175,6 +176,7 @@ pre_computed_hashes = [
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
    {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
+    {"name": "qwen35",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM-V-4_6", "chkhsh": "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f"},
    {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
    # jina-v2-de variants
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -188,6 +188,24 @@ class LoraTorchTensor:
    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
        return self.transpose(axis0, axis1)

+    def split(self, split_size: int | Sequence[int], dim: int = 0) -> tuple[LoraTorchTensor, ...]:
+        shape = self.shape
+        ndim = len(shape)
+        if dim < 0:
+            dim += ndim
+        if dim == ndim - 1:
+            A_chunks = self._lora_A.split(split_size, dim=-1)
+            return tuple(LoraTorchTensor(a, self._lora_B) for a in A_chunks)
+        elif dim == ndim - 2:
+            B_chunks = self._lora_B.split(split_size, dim=-2)
+            return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
+        else:
+            B_chunks = self._lora_B.split(split_size, dim=dim)
+            if self._lora_A.shape[dim] == 1:
+                return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
+            A_chunks = self._lora_A.split(split_size, dim=dim)
+            return tuple(LoraTorchTensor(a, b) for a, b in zip(A_chunks, B_chunks))
+
    def to(self, *args, **kwargs):
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

@@ -230,6 +248,11 @@ class LoraTorchTensor:
                )
            else:
                raise NotImplementedError
+        elif func is torch.split:
+            assert len(args) and len(args) >= 2
+            tensor, split_size = args[0], args[1]
+            dim = args[2] if len(args) > 2 else kwargs.get("dim", 0)
+            return tensor.split(split_size, dim=dim)
        else:
            raise NotImplementedError

--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -57,17 +57,22 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin

 ## Validated Models

-The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:
+The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html).
+- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using a GPU device.
+- `-fa 1` is required when running llama-bench with the OpenVINO backend.
+- Additional model support, quantization formats and validations are work in progress.

- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/)
- [Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B-GGUF)
- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-S-1B-sft-gguf)
- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF)
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
+| Model  | Validated   | Known Issues  |
+| :------| :---------- | :-------------|
+| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
+| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail |
+| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) |
+| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
+| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations |
+| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — |
+| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
+| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported |
+| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` |

 ## Build Instructions

--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -720,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_GRAPH    | OFF *(default)* \|ON *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
+| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

@@ -733,9 +734,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
+| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
-| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
+| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
+
+## Compile-time Flags
+
+Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.
+
+| Name            | Function                                                                         |
+|-----------------|----------------------------------------------------------------------------------|
+| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |

 ## Design Rule

@@ -811,7 +821,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 - `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`

-  You need to enable to support 4GB memory malloc by:
+  With the default `GGML_SYCL_ENABLE_LEVEL_ZERO=1`, llama.cpp requests Level Zero's relaxed maximum-size allocation limit directly. If Level Zero support is disabled at build time or runtime and the allocation goes through SYCL/Unified Runtime instead, enable support for allocations larger than 4 GiB by:
  ```
    export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
    set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -9,18 +9,20 @@ wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_6
 ~~~

 2. Build
-Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
+Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization versions are `RISCV64_SPACEMIT_IME1` and `RISCV64_SPACEMIT_IME2`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
 ```bash

 cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CPU_RISCV64_SPACEMIT=ON \
+    -DGGML_CPU_REPACK=OFF \
    -DLLAMA_OPENSSL=OFF \
    -DGGML_RVV=ON \
+    -DGGML_RV_ZVFH=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
    -DGGML_RV_ZIHINTPAUSE=ON \
-    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+    -DGGML_RV_ZBA=ON \
    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
    -DCMAKE_INSTALL_PREFIX=build/installed

@@ -47,8 +49,25 @@ export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}

 ${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
 ~~~
+
+## Quantization Support For Matrix
+
+| Quantization Type | X60 | A100 |
+| ---: | ---: | ---: |
+| Q2_K |  | :heavy_check_mark: |
+| Q3_K |  | :heavy_check_mark: |
+| Q4_0 | :heavy_check_mark: | :heavy_check_mark: |
+| Q4_1 | :heavy_check_mark: | :heavy_check_mark: |
+| Q4_K | :heavy_check_mark: | :heavy_check_mark: |
+| Q5_0 |  | :heavy_check_mark: |
+| Q5_1 |  | :heavy_check_mark: |
+| Q5_K |  | :heavy_check_mark: |
+| Q6_K |  | :heavy_check_mark: |
+| Q8_0 |  | :heavy_check_mark: |
+
+
 ## Performance
-#### Quantization Support For Matrix
+* Spacemit(R) X60
 ~~~
 model name      : Spacemit(R) X60
 isa             : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
@@ -58,33 +77,34 @@ mvendorid       : 0x710
 marchid         : 0x8000000058000001
 ~~~

-Q4_0
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | pp512|64.12 ± 0.26|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | tg128|10.03 ± 0.01|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | pp512|24.16 ± 0.02|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | tg128|3.83 ± 0.06|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | pp512|12.08 ± 0.02|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | tg128|2.23 ± 0.02|
-
-Q4_1
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | pp512|62.07 ± 0.12|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | tg128|9.91 ± 0.01|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | pp512|22.95 ± 0.25|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | tg128|4.01 ± 0.15|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | pp512|11.55 ± 0.16|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | tg128|2.25 ± 0.04|
+| model                          |       size |     params | backend    | threads | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       4 |      128 |  1 |    0 |           pp128 |         10.32 ± 0.02 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       4 |      128 |  1 |    0 |           tg128 |          3.07 ± 0.01 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       4 |      128 |  1 |    0 |           pp128 |         49.15 ± 0.25 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       4 |      128 |  1 |    0 |           tg128 |         11.73 ± 0.02 |


-Q4_K
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | pp512|9.29 ± 0.05|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | tg128|5.67 ± 0.04|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | pp512|10.38 ± 0.10|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | tg128|3.17 ± 0.08|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | pp512|4.23 ± 0.04|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | tg128|1.73 ± 0.00|
+* Spacemit(R) A100
+~~~
+model name      : Spacemit(R) A100
+isa             : rv64imafdcvh_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
+mmu             : sv39
+mvendorid       : 0x710
+marchid         : 0x8000000041000002
+mimpid          : 0x10000000d5686200
+hart isa        : rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
+~~~
+
+| model                          |       size |     params | backend    | threads | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       8 |      128 |  1 |    0 |           pp128 |        565.83 ± 0.31 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       8 |      128 |  1 |    0 |           tg128 |         55.77 ± 0.02 |
+| qwen3 4B Q4_0                  |   2.21 GiB |     4.02 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         79.74 ± 0.04 |
+| qwen3 4B Q4_0                  |   2.21 GiB |     4.02 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         11.29 ± 0.00 |
+| qwen3moe 30B.A3B Q4_0          |  16.18 GiB |    30.53 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         57.88 ± 0.31 |
+| qwen3moe 30B.A3B Q4_0          |  16.18 GiB |    30.53 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         12.79 ± 0.00 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |        115.23 ± 0.04 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         16.49 ± 0.01 |
+| gemma4 E4B Q4_K - Medium       |   4.76 GiB |     7.52 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         21.13 ± 0.01 |
+| gemma4 E4B Q4_K - Medium       |   4.76 GiB |     7.52 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |          5.66 ± 0.00 |
--- a/docs/multi-gpu.md
+++ b/docs/multi-gpu.md
@@ -0,0 +1,127 @@
+# Using multiple GPUs with llama.cpp
+
+This guide explains how to run [llama.cpp](https://github.com/ggml-org/llama.cpp) across more than one GPU. It covers the split modes, the command-line flags that control them, the limitations you need to know about, and ready-to-use recipes for `llama-cli` and `llama-server`.
+
+The CLI arguments listed here are the same for both tools - or most llama.cpp binaries for that matter.
+
+---
+
+## When you need multi-GPU
+
+Reach for multi-GPU when one of these is true:
+
+- **The model doesn't fit in a single GPU's VRAM.** By spreading the weights across two or more GPUs the whole model can stay on accelerators. Otherwise part of the model will need to be run off of the comparatively slower system RAM.
+- **You want more throughput.** By distributing the computation across multiple GPUs, each individual GPU has to do less work. This can result in better prefill and/or token generation performance, depending on the split mode and interconnect speed vs. the speed of an individual GPU.
+
+---
+
+## The split modes
+
+Set with `--split-mode` / `-sm`.
+
+| Mode | What it does | When to use |
+|---|---|---|
+| `none` | Use a single GPU only. Pick which one with `--main-gpu`. | You explicitly want to confine the model to one GPU even though more are visible. |
+| `layer` (**default**) | Pipeline parallelism. Each GPU holds a contiguous slice of layers. The KV cache for layer *l* lives on the GPU that owns layer *l*. | Default and most compatible multi-GPU choice. You want more memory than a single GPU provides and your priority is a fast prefill. Can tolerate slow interconnect speeds between GPUs. |
+| `row` | **Deprecated.** Older row-split tensor-parallel path with comparatively poor performance. Splits only dense weights across GPUs. Superseded by `tensor` which should be universally superior if it can be used. | Avoid in new deployments. |
+| `tensor` | **EXPERIMENTAL.** Tensor parallelism that splits both weights *and* KV across the participating GPUs via a "meta device" abstraction. | You want more memory than a single GPU provides and your priority is fast token generation. Prefill speeds approach pipeline parallel speeds for large, dense models and fast GPU interconnect speeds. Treat as experimental as the code is less mature than pipeline parallelism. Performance should be good for multiple NVIDIA GPUs using the CUDA backend, no guarantees otherwise. |
+
+> Pipeline parallel (`layer`) vs. tensor parallel (`tensor`): pipeline-parallel runs different layers on different GPUs and processes tokens sequentially through the pipeline. This minimizes data transfers between GPUs but requires many tokens to scale well. Tensor-parallel splits each layer across GPUs and does multiple cross-GPU reductions per layer. This enables parallelizing any workload but is much more bottlenecked by the GPU interconnect speed. Pipeline-parallel maximizes batch throughput; tensor-parallel minimizes latency.
+
+---
+
+## Command-line arguments reference
+
+| Short | Long | Value | Default | Notes |
+|---|---|---|---|---|
+| `-sm` | `--split-mode` | `none` \| `layer` \| `row` (deprecated) \| `tensor` | `layer` | See modes above. |
+| `-ts` | `--tensor-split` | comma-separated proportions, e.g. `3,1` | mode-dependent | How much of the model goes to each GPU. If omitted, `layer`/`row` use automatic splitting proportional to memory, while `tensor` splits tensor segments evenly. With `3,1` on two GPUs, GPU 0 gets 75 %, GPU 1 gets 25 %. The values follow the order in `--device`. |
+| `-mg` | `--main-gpu` | integer device index | `0` | The single GPU used in `--split-mode none`. |
+| `-ngl` | `--n-gpu-layers` / `--gpu-layers` | integer \| `auto` \| `all` | `auto` | Maximum number of layers to keep in VRAM. Use `999` or `all` to push everything possible to the GPUs. |
+| `-dev` | `--device` | comma-separated device names, or `none` | auto | Restrict which devices llama.cpp may use. See `--list-devices` for names. |
+| | `--list-devices` | - | - | Print the available devices and their memory. Run this first to learn the names you'd pass to `--device`. |
+| `-fa` | `--flash-attn` | `on` \| `off` \| `auto` | `auto` | Required when using `--split-mode tensor` and/or quantized V cache. Supported (and therefore enabled by default) for most combinations of models and backends. |
+| `-ctk` | `--cache-type-k` | `f32` \| `f16` \| `bf16` \| `q8_0` \| `q4_0` \| ... | `f16` | KV cache type for K. |
+| `-ctv` | `--cache-type-v` | same as `-ctk` | `f16` | KV cache type for V. |
+| `-fit` | `--fit` | `on` \| `off` | `on` | Auto-fit unset args to device memory. **Not supported with `tensor`. You may need to manually set the `--ctx-size` to make the model fit.**  |
+
+As with any CUDA program, the environment variable `CUDA_VISIBLE_DEVICES` can be used to control which GPUs to use for the CUDA backend: if you set it, llama.cpp only sees the specified GPUs. Use `--device` to select GPUs from among those visible to llama.cpp; this works for any backend.
+
+---
+
+## Recipes
+
+### 1. Default - pipeline parallel across all visible GPUs
+
+```bash
+llama-cli -m model.gguf
+llama-server -m model.gguf
+```
+
+Easiest configuration. KV cache spreads across the GPUs along with the layers. `--fit` (on by default) sizes things automatically.
+
+### 2. Pipeline parallel with a custom split ratio
+
+```bash
+llama-cli -m model.gguf -ts 3,1
+```
+
+Useful when GPUs have different memory: GPU 0 (3 parts) and GPU 1 (1 part). Proportions are normalized so `-ts 3,1` is the same as e.g. `-ts 75,25`.
+
+### 3. Single-GPU mode, picking a specific GPU
+
+```bash
+llama-cli --list-devices
+llama-cli -m model.gguf -dev CUDA1
+```
+
+Use only the device that `--list-devices` reports as `CUDA1`.
+
+### 4. Tensor parallelism (experimental)
+
+```bash
+llama-cli -m model.gguf -sm tensor -ctk f16 -ctv f16
+```
+
+- `--flash-attn off` (or `--flash-attn auto` resolving to `off` when it isn't supported) is a hard error.
+- KV cache types must be non-quantized: `f32`, `f16`, or `bf16`. Support for quantized KV cache is not implemented and trying to use it will result in an error.
+- Mark this configuration as experimental in your tooling: validate output quality before deploying.
+- `--split-mode tensor` is not implemented for all architectures. The following will fail with *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '...'"*:
+
+  - **MoE / hybrid:** Grok, MPT, OLMoE, DeepSeek2, GLM-DSA, Nemotron-H, Nemotron-H-MoE, Granite-Hybrid, LFM2-MoE, Minimax-M2, Mistral4, Kimi-Linear, Jamba, Falcon-H1
+  - **State-space / RWKV-style:** Mamba, Mamba2 (and the hybrid Mamba-attention models above)
+  - **Other:** PLAMO2, MiniCPM3, Gemma-3n, OLMo2, BitNet, T5
+
+### 5. With NCCL
+
+There's no runtime flag for NCCL - it's selected at build time (`-DGGML_CUDA_NCCL=ON`, this is the default). Note that NCCL is **not** automatically distributed with CUDA and you may need to install it manually - when in doubt check the CMake log to see whether or not it can find the package. When llama.cpp is compiled with NCCL support it uses it automatically for cross-GPU reductions in `tensor` mode. When NCCL is missing on a multi-GPU build, you'll see this one-time warning and performance will be lower:
+
+```
+NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal
+```
+
+When using the "ROCm" backend (which is the ggml CUDA code translated for AMD via HIP), the AMD equivalent RCCL can be used by compiling with `-DGGML_HIP_RCCL=ON`. Note that RCCL is by default *disabled* because (unlike NCCL) it was not universally beneficial during testing.
+### 6. With CUDA peer-to-peer access (`GGML_CUDA_P2P`)
+
+CUDA peer-to-peer (P2P) lets GPUs transfer data directly between each other instead of going through system memory, which generally improves multi-GPU performance. It is **opt-in** at runtime - set the environment variable `GGML_CUDA_P2P` to any value to enable it:
+
+```bash
+GGML_CUDA_P2P=1 llama-cli -m model.gguf -sm tensor
+```
+
+P2P requires driver support (usually restricted to workstation/datacenter GPUs) and **may cause crashes or corrupted outputs on some motherboards or BIOS configurations** (e.g. when IOMMU is enabled). If you see instability after enabling it, unset the variable.
+
+---
+
+## Troubleshooting
+
+| Symptom | How to fix |
+|---|---|
+| Startup error *"SPLIT_MODE_TENSOR requires flash_attn to be enabled"* | Add `-fa on` or remove `-fa off`. |
+| Startup error *"simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented"* | Use `-ctk f16 -ctv f16` (or `bf16`/`f32`) with `--split-mode tensor`. |
+| Startup error *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture 'X'"* | Architecture not on the TENSOR allow-list. Use `--split-mode layer`. |
+| Warning *"NCCL is unavailable, multi GPU performance will be suboptimal"* | llama.cpp wasn't built with NCCL. Either accept the lower performance or install NCCL and rebuild. |
+| CUDA OOM at startup or during prefill in `--split-mode tensor` | Auto-fit is disabled in this mode, so reduce memory pressure yourself. In order from least to most disruptive: lower `--ctx-size` (`-c`) (KV cache is roughly proportional to `n_ctx`); for `llama-server`, lower `--parallel` (`-np`) (a slot KV cache is allocated per concurrent sequence); as a last resort, reduce `--n-gpu-layers` (`-ngl`) (the remaining layers run on CPU and inference will be much slower). |
+| Performance is worse with multi-GPU than single-GPU | The performance is bottlenecked by GPU interconnect speed. For `--split-mode tensor`, verify that NCCL is being used. Try `--split-mode layer` (less communication than `tensor`). Increase GPU interconnect speed via more PCIe lanes or e.g. NVLink (if available). |
+| GPU not used at all | `--n-gpu-layers` is `0` or too low - try explicitly setting `-ngl all`. Or you are accidentally hiding the GPUs via an environment variable like `CUDA_VISIBLE_DEVICES=-1`. Or your build doesn't include support for the relevant backend. |
+| Crashes or corrupted outputs after setting `GGML_CUDA_P2P=1` | Some motherboards and BIOS settings (e.g. with IOMMU enabled) don't support CUDA peer-to-peer reliably. Unset `GGML_CUDA_P2P`. |
--- a/docs/multimodal/minicpmv4.6.md
+++ b/docs/multimodal/minicpmv4.6.md
@@ -0,0 +1,49 @@
+## MiniCPM-V 4.6
+
+### Prepare models and code
+
+Download the [MiniCPM-V-4_6](https://huggingface.co/openbmb/MiniCPM-V-4_6) PyTorch model from Hugging Face into a "MiniCPM-V-4_6" folder.
+
+The model must be the standard `transformers` v5.7.0+ checkpoint (no `trust_remote_code`); the architecture in `config.json` is `MiniCPMV4_6ForConditionalGeneration` with a `qwen3_5_text` text model and a SigLIP-based vision tower plus a window-attention `vit_merger`.
+
+### Build llama.cpp
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-V 4.6
+
+Unlike older MiniCPM-V variants, MiniCPM-V 4.6 is converted directly through `convert_hf_to_gguf.py`. The same script is invoked twice on the original Hugging Face directory: once to produce the language-model GGUF and once with `--mmproj` to produce the multimodal projector GGUF.
+
+```bash
+# language model
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --outfile ../MiniCPM-V-4_6/ggml-model-f16.gguf
+
+# multimodal projector (vision tower + window-attention vit_merger + DownsampleMLP merger)
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --mmproj --outfile ../MiniCPM-V-4_6/mmproj-model-f16.gguf
+
+# optional: quantize to Q4_K_M
+./build/bin/llama-quantize ../MiniCPM-V-4_6/ggml-model-f16.gguf ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Inference on Linux or Mac
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf
+```
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -17,8 +17,8 @@ Legend:
 |                              ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
@@ -36,15 +36,15 @@ Legend:
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                             DIAG | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             DIAG | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
@@ -61,16 +61,17 @@ Legend:
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 |                              LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
@@ -101,11 +102,11 @@ Legend:
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                              SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
@@ -117,5 +118,5 @@ Legend:
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
 This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.

 ```
-llama-server [...] --spec-type ngram-simple --draft-max 64
+llama-server [...] --spec-type ngram-simple --spec-draft-n-max 64
 ```

 #### n-gram Map Key (`ngram-map-k`)

-This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
+This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-map-k-min-hits`, default is 1) before generating drafts.

 The number of accepted tokens is stored for each used n-gram.

 **Example:**
 ```
-llama-server [...] --spec-type ngram-map-k --draft-max 64
+llama-server [...] --spec-type ngram-map-k --spec-draft-n-max 64
 ```

 #### n-gram Map Key-4-Values (`ngram-map-k4v`)
@@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.

 **Example:** Server options to be used if there are a lot of longer repetitions.
 ```
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
+llama-server [...] --spec-type ngram-map-k4v --spec-ngram-map-k4v-size-n 8 --spec-ngram-map-k4v-size-m 8 --spec-ngram-map-k4v-min-hits 2 --spec-draft-n-max 64
 ```

 ### n-gram Mod (`ngram-mod`)
@@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
 # notes:
 # - small `n` are not recommended
 # - MoEs require long drafts
-# - dense models: can reduce `--draft-min` and `--draft-max`
+# - dense models: can reduce `--spec-ngram-mod-n-min` and `--spec-ngram-mod-n-max`

-llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
+llama-server ... --spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
 ```

 Applications:
@@ -105,21 +105,90 @@ Example Video:

 If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.

+### General Speculative Parameters
+
 ```
--draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
-                                        (env: LLAMA_ARG_DRAFT_MAX)
--draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
-                                        (default: 0)
-                                        (env: LLAMA_ARG_DRAFT_MIN)
-[...]
 --spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                        type of speculative decoding to use when no draft model is provided
                                        (default: none)
--spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
-                                        of lookup n-gram (default: 12)
--spec-ngram-size-m N                   ngram size M for ngram-simple/ngram-map speculative decoding, length
-                                        of draft m-gram (default: 48)
--spec-ngram-min-hits N                 minimum hits for ngram-map speculative decoding (default: 1)
+                                        (env: LLAMA_ARG_SPEC_TYPE)
+--spec-default                          use default speculative decoding
+```
+
+### Draft Model Parameters
+
+```
+--spec-draft-model, -md, --model-draft  FNAME
+                                        draft model for speculative decoding (default: unused)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_MODEL)
+--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft  <user>/<model>[:quant]
+                                        HuggingFace repository for the draft model
+--spec-draft-n-max                      N
+                                        number of tokens to draft for speculative decoding (default: 16)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
+--spec-draft-n-min                      N
+                                        minimum number of draft tokens to use for speculative decoding (default: 0)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MIN)
+--spec-draft-p-split, --draft-p-split   P
+                                        speculative decoding split probability (default: 0.10)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
+--spec-draft-p-min, --draft-p-min       P
+                                        minimum speculative decoding probability (greedy) (default: 0.75)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
+--spec-draft-ctx-size, -cd, --ctx-size-draft  N
+                                        size of the prompt context for the draft model (default: 0, 0 = loaded from model)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
+--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft  N
+                                        max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
+                                        (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
+--spec-draft-device, -devd, --device-draft  <dev1,dev2,..>
+                                        comma-separated list of devices to use for offloading the draft model
+--spec-draft-replace, --spec-replace    TARGET  DRAFT
+                                        translate the string in TARGET into DRAFT if the draft model and main model are not compatible
+```
+
+### n-gram Mod Parameters
+
+```
+--spec-ngram-mod-n-match                N
+                                        ngram-mod lookup length (default: 24)
+--spec-ngram-mod-n-min                  N
+                                        minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48)
+--spec-ngram-mod-n-max                  N
+                                        maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64)
+```
+
+### n-gram Simple Parameters
+
+```
+--spec-ngram-simple-size-n              N
+                                        ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-simple-size-m              N
+                                        ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-simple-min-hits            N
+                                        minimum hits for ngram-simple speculative decoding (default: 1)
+```
+
+### n-gram Map Key Parameters
+
+```
+--spec-ngram-map-k-size-n               N
+                                        ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-map-k-size-m               N
+                                        ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-map-k-min-hits             N
+                                        minimum hits for ngram-map-k speculative decoding (default: 1)
+```
+
+### n-gram Map Key-4-Values Parameters
+
+```
+--spec-ngram-map-k4v-size-n             N
+                                        ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-map-k4v-size-m             N
+                                        ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-map-k4v-min-hits           N
+                                        minimum hits for ngram-map-k4v speculative decoding (default: 1)
 ```

 ### `--spec-type TYPE`
@@ -140,21 +209,40 @@ Specifies a type of speculative decoding without draft model.
 ./llama-server [...] --spec-type ngram-simple
 ```

-### `--spec-ngram-size-n N`
+### `--spec-ngram-*-size-n N`

 Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
 The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.

-### `--spec-ngram-size-m M`
+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-size-n` for `ngram-simple`
+- `--spec-ngram-map-k-size-n` for `ngram-map-k`
+- `--spec-ngram-map-k4v-size-n` for `ngram-map-k4v`
+- `--spec-ngram-mod-n-match` for `ngram-mod`
+
+### `--spec-ngram-*-size-m M`

 Sets the size M of the draft m-gram for n-gram map based speculative decoding.
 The m-gram size determines how many tokens to draft when a match is found.
 Larger values can provide more speedup but may reduce acceptance rate.

-### `--spec-ngram-min-hits H`
+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-size-m` for `ngram-simple`
+- `--spec-ngram-map-k-size-m` for `ngram-map-k`
+- `--spec-ngram-map-k4v-size-m` for `ngram-map-k4v`
+
+### `--spec-ngram-*-min-hits H`

 This option defines how often a key has to appear in the token history to be used as a draft (default is 1).

+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-min-hits` for `ngram-simple`
+- `--spec-ngram-map-k-min-hits` for `ngram-map-k`
+- `--spec-ngram-map-k4v-min-hits` for `ngram-map-k4v`
+
 ## Statistics
 Each speculative decoding implementation prints statistics.

@@ -180,4 +268,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
 - `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
-
--- a/examples/diffusion/CMakeLists.txt
+++ b/examples/diffusion/CMakeLists.txt
@@ -1,5 +1,10 @@
+set(TARGET llama-diffusion)
+add_library(${TARGET} STATIC diffusion.cpp diffusion.h)
+target_link_libraries(${TARGET} PUBLIC llama llama-common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PUBLIC cxx_std_17)
+
 set(TARGET llama-diffusion-cli)
 add_executable(${TARGET} diffusion-cli.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-diffusion llama llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -12,11 +12,11 @@ The diffusion CLI supports various parameters to control the generation process:
 ### Core Diffusion Parameters
 - `--diffusion-steps`: Number of diffusion steps (default: 256)
 - `--diffusion-algorithm`: Algorithm for token selection
-  - `0`: ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
-  - `1`: ENTROPY_BASED - Entropy-based selection
-  - `2`: MARGIN_BASED - Margin-based selection
-  - `3`: RANDOM - Random selection
-  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
+  - `0`: DIFFUSION_ALGORITHM_ORIGIN - Tokens are generated in a purely random order, as described in https://arxiv.org/abs/2107.03006.
+  - `1`: DIFFUSION_ALGORITHM_ENTROPY_BASED - Entropy-based selection
+  - `2`: DIFFUSION_ALGORITHM_MARGIN_BASED - Margin-based selection
+  - `3`: DIFFUSION_ALGORITHM_RANDOM - Random selection
+  - `4`: DIFFUSION_ALGORITHM_CONFIDENCE_BASED - Confidence-based selection (default)
  - More documentation here https://github.com/DreamLM/Dream
 - `--diffusion-visual`: Enable live visualization during generation

--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -1,127 +1,23 @@
 #include "arg.h"
 #include "chat.h"
 #include "common.h"
+#include "diffusion.h"
 #include "llama.h"
 #include "log.h"

 #include <limits.h>

-#include <algorithm>
 #include <clocale>
-#include <cmath>
 #include <cstring>
-#include <limits>
-#include <random>
 #include <string>
 #include <vector>

-enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
-
-// Unified transfer scheduling methods
-enum transfer_schedule {
-    TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
-    BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
-};
-
-typedef bool (*diffusion_step_callback_t)(int32_t             step,
-                                          int32_t             total_steps,
-                                          const llama_token * tokens,
-                                          int32_t             n_tokens,
-                                          void *              user_data);
-
-struct diffusion_params {
-    int32_t                   steps                   = 0;
-    float                     temperature             = 0;
-    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;
-    diffusion_step_callback_t step_callback           = nullptr;
-    void *                    step_callback_user_data = nullptr;
-    int32_t                   seed                    = 0;
-    bool                      visual_mode             = false;
-    bool                      shift_logits            = false;  // Shift logits by -1 after decode
-
-    float   top_p = 0.;
-    int32_t top_k = 0.;
-
-    diffusion_algorithm algorithm = CONFIDENCE_BASED;
-    transfer_schedule   schedule  = TIMESTEP_BASED;
-
-    float   cfg_scale        = 0.;     // Config scale for classifier-free guidance
-    float   eps              = 0.;     // Timestep scheduling
-    int32_t block_length     = 0;      // Block size (for block scheduling)
-    float   alg_temp         = 0;      // algorithm temperature (0.0 = deterministic)
-    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0
-
-    int32_t max_length = 0;            // Maximum sequence length
-};
-
 struct callback_data {
    diffusion_params *  diff_params;
    const llama_vocab * vocab;
    int32_t             n_input;
 };

-static float calculate_confidence(const llama_token_data_array & cur_p,
-                                  diffusion_algorithm            algorithm,
-                                  std::mt19937 &                 rng) {
-    switch (algorithm) {
-        case CONFIDENCE_BASED:
-            return cur_p.data[cur_p.selected].p;  // Selected token probability
-
-        case ENTROPY_BASED:
-            {
-                float       entropy = 0.0f;
-                const float epsilon = 1e-10f;
-                for (size_t i = 0; i < cur_p.size; i++) {
-                    float prob = cur_p.data[i].p;
-                    entropy += prob * logf(prob + epsilon);
-                }
-                return -entropy;  // Higher entropy = lower confidence
-            }
-
-        case MARGIN_BASED:
-            return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
-
-        case RANDOM:
-            {
-                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
-                return uniform(rng);  // Random confidence
-            }
-
-        case ORIGIN:
-            return cur_p.data[cur_p.selected].p;
-
-        default:
-            return 0.0f;
-    }
-}
-
-// Unified transfer count calculation function
-static int32_t calculate_transfer_count(int32_t                      step,
-                                        int32_t                      total_steps,
-                                        int32_t                      remaining_masked,
-                                        transfer_schedule            schedule,
-                                        float                        eps,
-                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
-    switch (schedule) {
-        case TIMESTEP_BASED:
-            {
-                float t          = 1.0f - (float) step / total_steps * (1.0f - eps);
-                float s          = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
-                float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
-                return (int32_t) (remaining_masked * p_transfer);
-            }
-
-        case BLOCK_BASED:
-            if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
-                return num_transfer_tokens[step];
-            }
-            return remaining_masked / (total_steps - step);  // Fallback
-
-        default:
-            return remaining_masked / (total_steps - step);
-    }
-}
-
 static bool diffusion_step_callback(int32_t             step,
                                    int32_t             total_steps,
                                    const llama_token * tokens,
@@ -176,341 +72,6 @@ static bool diffusion_step_callback(int32_t             step,
    return true;
 }

-static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
-    if (temperature == 0.0f) {
-        return;
-    }
-
-    std::uniform_real_distribution<double> uniform(0.0, 1.0);
-    for (int32_t i = 0; i < n_vocab; i++) {
-        double noise        = uniform(rng);
-        // Prevent log(0)
-        noise               = std::max(noise, 1e-20);
-        double gumbel_noise = std::pow(-std::log(noise), temperature);
-        logits[i]           = std::exp(logits[i]) / gumbel_noise;
-    }
-}
-
-static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
-    std::vector<int32_t> num_transfer_tokens(steps);
-
-    int32_t base      = mask_count / steps;
-    int32_t remainder = mask_count % steps;
-
-    for (int32_t i = 0; i < steps; i++) {
-        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
-    }
-
-    return num_transfer_tokens;
-}
-
-static void diffusion_generate(llama_context *          ctx,
-                               const llama_token *      input_tokens,
-                               llama_token *            output_tokens,
-                               int32_t                  n_input,
-                               const diffusion_params & params,
-                               int32_t &                n_generated) {
-    n_generated = 0;
-    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
-        return;
-    }
-
-    const llama_model * model = llama_get_model(ctx);
-
-    // Initialize with input and pad with mask tokens
-    std::copy(input_tokens, input_tokens + n_input, output_tokens);
-    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
-
-    std::mt19937 rng(params.seed);
-
-    llama_set_causal_attn(ctx, false);
-
-    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
-
-    std::vector<llama_token_data> candidates(n_vocab);
-    std::vector<llama_token_data> conf_candidates;
-    conf_candidates.reserve(params.max_length);
-    std::vector<int32_t> mask_positions;
-    mask_positions.reserve(params.max_length);
-
-    // Setup sampler chain
-    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
-    if (params.top_k > 0) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
-    }
-    if (params.top_p < 1.0f) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
-    }
-    if (params.temperature > 0.0f) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
-    }
-    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
-
-    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
-
-    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
-    batch.n_tokens    = params.max_length;
-
-    // Pre-allocate buffers for CFG if needed
-    int32_t                  logits_size = n_vocab * params.max_length;
-    std::vector<float>       cond_logits_buffer;
-    std::vector<llama_token> un_x_buffer;
-    if (params.cfg_scale > 0.0f) {
-        cond_logits_buffer.resize(logits_size);
-        un_x_buffer.resize(params.max_length);
-    }
-
-    // For block-based processing
-    std::vector<int32_t> num_transfer_tokens;
-    int32_t              num_blocks      = 1;
-    int32_t              steps_per_block = params.steps;
-
-    if (params.schedule == BLOCK_BASED) {
-        GGML_ASSERT(params.max_length % params.block_length == 0);
-        num_blocks = params.max_length / params.block_length;
-        GGML_ASSERT(params.steps % num_blocks == 0);
-        steps_per_block = params.steps / num_blocks;
-    }
-
-    std::vector<float> confidence(params.max_length);
-
-    int64_t total_sampling_time = 0;
-    int64_t total_time          = 0;
-    int64_t time_start          = ggml_time_us();
-
-    for (int block_num = 0; block_num < num_blocks; block_num++) {
-        int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
-        int32_t block_end   = (params.schedule == BLOCK_BASED) ?
-                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
-                                  params.max_length;
-
-        // Count masked tokens in current block for block-based processing
-        if (params.schedule == BLOCK_BASED) {
-            int32_t block_mask_count = 0;
-            for (int i = block_start; i < block_end; i++) {
-                if (output_tokens[i] == params.mask_token_id) {
-                    block_mask_count++;
-                }
-            }
-            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
-        }
-
-        for (int32_t step = 0; step < steps_per_block; step++) {
-            int32_t global_step = block_num * steps_per_block + step;
-
-            if (params.step_callback) {
-                if (!params.step_callback(
-                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
-                    break;
-                }
-            }
-
-            // Setup batch
-            for (int32_t i = 0; i < params.max_length; i++) {
-                batch.token[i]     = output_tokens[i];
-                batch.pos[i]       = i;
-                batch.n_seq_id[i]  = 1;
-                batch.seq_id[i][0] = 0;
-                batch.logits[i]    = 1;
-            }
-
-            float * logits = nullptr;
-
-            if (params.cfg_scale > 0.0f) {
-                int ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("Failed to generate conditional");
-                    break;
-                }
-                float * cond_logits_ptr = llama_get_logits(ctx);
-                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
-
-                // Unconditional generation (mask input)
-                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
-                for (int32_t i = 0; i < n_input; i++) {
-                    un_x_buffer[i] = params.mask_token_id;
-                }
-
-                for (int32_t i = 0; i < params.max_length; i++) {
-                    batch.token[i] = un_x_buffer[i];
-                }
-                ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("Failed to generate unconditional");
-                    break;
-                }
-                float * uncond_logits = llama_get_logits(ctx);
-
-                // Apply CFG
-                for (int32_t i = 0; i < logits_size; i++) {
-                    cond_logits_buffer[i] =
-                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
-                }
-                logits = cond_logits_buffer.data();
-            } else {
-                int ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
-                    break;
-                }
-                logits = llama_get_logits(ctx);
-            }
-
-            if (!logits) {
-                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
-                break;
-            }
-
-            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
-                if (params.shift_logits) {
-                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
-                }
-                return logits + (pos) *n_vocab;
-            };
-
-            int64_t time_start_sampling = ggml_time_us();
-
-            mask_positions.clear();
-            for (int32_t i = 0; i < params.max_length; i++) {
-                if (output_tokens[i] == params.mask_token_id) {
-                    // For block-based, only consider current block
-                    if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
-                        mask_positions.push_back(i);
-                    }
-                }
-            }
-
-            if (mask_positions.empty()) {
-                break;
-            }
-
-            if (params.add_gumbel_noise && params.temperature > 0.0f) {
-                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
-            }
-
-            if (params.algorithm == ORIGIN) {
-                int32_t transfer_count = calculate_transfer_count(
-                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
-                float p_transfer = (float) transfer_count / mask_positions.size();
-
-                for (int32_t pos : mask_positions) {
-                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
-                        const float * pos_logits = get_logits_for_pos(pos);
-                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
-                            candidates[token_id].id    = token_id;
-                            candidates[token_id].logit = pos_logits[token_id];
-                            candidates[token_id].p     = 0.0f;
-                        }
-
-                        llama_token_data_array cur_p = {
-                            candidates.data(),
-                            (size_t) n_vocab,
-                            -1,
-                            false,
-                        };
-
-                        llama_sampler_apply(sampler, &cur_p);
-                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
-                    }
-                }
-            } else {
-                std::vector<std::pair<float, int32_t>> confidences;
-                std::vector<llama_token>               sampled_tokens(mask_positions.size());
-
-                for (size_t i = 0; i < mask_positions.size(); i++) {
-                    int32_t       pos        = mask_positions[i];
-                    const float * pos_logits = get_logits_for_pos(pos);
-
-                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
-                        candidates[token_id].logit = pos_logits[token_id];
-                        candidates[token_id].p     = 0.0f;
-                        candidates[token_id].id    = token_id;
-                    }
-
-                    llama_token_data_array cur_p = {
-                        candidates.data(),
-                        candidates.size(),
-                        -1,
-                        false,
-                    };
-
-                    llama_sampler_apply(sampler, &cur_p);
-                    llama_token sampled_token = cur_p.data[cur_p.selected].id;
-
-                    float conf = calculate_confidence(cur_p, params.algorithm, rng);
-
-                    sampled_tokens[i] = sampled_token;
-                    confidences.emplace_back(conf, i);
-                }
-
-                int32_t transfer_count = calculate_transfer_count(
-                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
-
-                if (transfer_count > 0) {
-                    if (params.alg_temp == 0.0f) {
-                        std::partial_sort(confidences.begin(),
-                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
-                                          confidences.end(),
-                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
-                                              if (a.first != b.first) {
-                                                  return a.first > b.first;
-                                              }
-                                              return a.second < b.second;
-                                          });
-
-                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
-                            int32_t mask_idx   = confidences[i].second;
-                            int32_t pos        = mask_positions[mask_idx];
-                            output_tokens[pos] = sampled_tokens[mask_idx];
-                        }
-                    } else {
-                        conf_candidates.clear();
-                        for (size_t i = 0; i < confidences.size(); i++) {
-                            float conf_logit = confidences[i].first / params.alg_temp;
-                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
-                        }
-
-                        llama_token_data_array conf_array = {
-                            conf_candidates.data(),
-                            conf_candidates.size(),
-                            -1,
-                            false,
-                        };
-
-                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
-                            llama_sampler_apply(dist_sampler, &conf_array);
-                            int32_t selected_idx = conf_array.selected;
-                            int32_t mask_idx     = selected_idx;
-                            int32_t pos          = mask_positions[mask_idx];
-                            output_tokens[pos]   = sampled_tokens[mask_idx];
-
-                            conf_candidates[selected_idx].p = 0.0f;
-                            conf_array.selected             = -1;
-                        }
-                    }
-                }
-            }
-
-            int64_t time_end_sampling = ggml_time_us();
-            total_sampling_time += time_end_sampling - time_start_sampling;
-        }
-    }
-
-    int64_t time_end = ggml_time_us();
-    total_time += time_end - time_start;
-
-    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
-            total_time / 1000.0,
-            total_time / 1000.0 / params.steps,
-            total_sampling_time / 1000.0 / params.steps);
-
-    llama_batch_free(batch);
-    llama_sampler_free(sampler);
-    llama_sampler_free(dist_sampler);
-
-    n_generated = params.max_length;
-}
-
 static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
    if (!use_chat_template) {
        return prompt;
@@ -631,10 +192,10 @@ int main(int argc, char ** argv) {
    GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));

    if (params.diffusion.eps) {
-        diff_params.schedule = TIMESTEP_BASED;
+        diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
        diff_params.eps      = params.diffusion.eps;
    } else if (params.diffusion.block_length) {
-        diff_params.schedule     = BLOCK_BASED;
+        diff_params.schedule     = DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED;
        diff_params.block_length = params.diffusion.block_length;
    }

@@ -653,8 +214,17 @@ int main(int argc, char ** argv) {
    callback_data cb_data               = { &diff_params, vocab, n_input };
    diff_params.step_callback_user_data = &cb_data;

-    const char * alg_names[]   = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
-    const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
+    const char * alg_names[]   = {
+        "DIFFUSION_ALGORITHM_ORIGIN",
+        "DIFFUSION_ALGORITHM_ENTROPY_BASED",
+        "DIFFUSION_ALGORITHM_MARGIN_BASED",
+        "DIFFUSION_ALGORITHM_RANDOM",
+        "DIFFUSION_ALGORITHM_CONFIDENCE_BASED",
+    };
+    const char * sched_names[] = {
+        "DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED",
+        "DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED",
+    };
    const char * alg_name =
        (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
    const char * sched_name =
@@ -666,11 +236,11 @@ int main(int argc, char ** argv) {
    LOG_INF("diffusion_params: - %-25s enum             = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
    LOG_INF("diffusion_params: - %-25s enum             = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
    LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "temperature", diff_params.temperature);
-    if (diff_params.schedule == TIMESTEP_BASED) {
+    if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED) {
        LOG_INF("diffusion_params: - %-25s f32              = %.6f\n", "eps", diff_params.eps);
        LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "alg_temp", diff_params.alg_temp);
    }
-    if (diff_params.schedule == BLOCK_BASED) {
+    if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
        LOG_INF("diffusion_params: - %-25s u32              = %d\n", "block_length", diff_params.block_length);
        LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "cfg_scale", diff_params.cfg_scale);
    }
--- a/examples/diffusion/diffusion.cpp
+++ b/examples/diffusion/diffusion.cpp
@@ -0,0 +1,408 @@
+#include "diffusion.h"
+
+#include "log.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cmath>
+#include <cstring>
+#include <random>
+#include <utility>
+#include <vector>
+
+// Computes the confidence score of one sampled position. Positions with a higher
+// score are committed first by diffusion_generate().
+//
+// cur_p     - candidate array after the sampler chain ran: `selected` indexes the
+//             sampled token and the `p` fields hold the candidate probabilities
+// algorithm - scoring rule to apply
+// rng       - random source, only consumed by DIFFUSION_ALGORITHM_RANDOM
+//
+// Returns 0.0f for an unknown algorithm.
+static float calculate_confidence(const llama_token_data_array & cur_p,
+                                  diffusion_algorithm            algorithm,
+                                  std::mt19937 &                 rng) {
+    switch (algorithm) {
+        case DIFFUSION_ALGORITHM_CONFIDENCE_BASED:
+            return cur_p.data[cur_p.selected].p;  // Selected token probability
+
+        case DIFFUSION_ALGORITHM_ENTROPY_BASED:
+            {
+                float       entropy = 0.0f;
+                const float epsilon = 1e-10f;  // keeps logf() finite for zero probabilities
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    float prob = cur_p.data[i].p;
+                    entropy += prob * logf(prob + epsilon);
+                }
+                return -entropy;  // Higher entropy = lower confidence
+            }
+
+        case DIFFUSION_ALGORITHM_MARGIN_BASED:
+            {
+                // Gap between the two largest probabilities. The candidates may not be
+                // ordered by probability (that depends on which samplers ran), so scan
+                // for the top two instead of reading data[0] and data[1]. This also
+                // yields 0 for an empty array and p for a single candidate.
+                float top1 = 0.0f;
+                float top2 = 0.0f;
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    const float prob = cur_p.data[i].p;
+                    if (prob > top1) {
+                        top2 = top1;
+                        top1 = prob;
+                    } else if (prob > top2) {
+                        top2 = prob;
+                    }
+                }
+                return top1 - top2;
+            }
+
+        case DIFFUSION_ALGORITHM_RANDOM:
+            {
+                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
+                return uniform(rng);  // Random confidence
+            }
+
+        case DIFFUSION_ALGORITHM_ORIGIN:
+            return cur_p.data[cur_p.selected].p;
+
+        default:
+            return 0.0f;
+    }
+}
+
+// Number of masked tokens to commit at `step` (0-based) out of `total_steps`.
+//  - timestep schedule: a fraction of `remaining_masked` derived from `eps`;
+//    the last step takes everything that is left
+//  - block schedule: the per-step entry of `num_transfer_tokens` when one exists
+// Every other case falls back to an even split over the steps that remain.
+static int32_t calculate_transfer_count(int32_t                      step,
+                                        int32_t                      total_steps,
+                                        int32_t                      remaining_masked,
+                                        diffusion_transfer_schedule  schedule,
+                                        float                        eps,
+                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
+    if (schedule == DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED) {
+        const float t_now  = 1.0f - (float) step / total_steps * (1.0f - eps);
+        const float t_next = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
+
+        float fraction = 1.0f;
+        if (step < total_steps - 1) {
+            fraction = 1.0f - t_next / t_now;
+        }
+        return (int32_t) (remaining_masked * fraction);
+    }
+
+    const bool has_table_entry = schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED &&
+                                 !num_transfer_tokens.empty() &&
+                                 step < (int32_t) num_transfer_tokens.size();
+    if (has_table_entry) {
+        return num_transfer_tokens[step];
+    }
+
+    // Fallback: spread what is left evenly over the remaining steps
+    return remaining_masked / (total_steps - step);
+}
+
+// Perturbs the first `n_vocab` entries of `logits` in place. Each value x becomes
+// exp(x) / (-log(u))^temperature, with u drawn uniformly from [0, 1) and clamped
+// to at least 1e-20. One random draw is consumed per entry. Does nothing when
+// `temperature` is exactly zero.
+static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
+    if (temperature != 0.0f) {
+        std::uniform_real_distribution<double> unit_interval(0.0, 1.0);
+
+        int32_t idx = 0;
+        while (idx < n_vocab) {
+            const double u       = unit_interval(rng);
+            const double clamped = (u < 1e-20) ? 1e-20 : u;  // Prevent log(0)
+            const double divisor = std::pow(-std::log(clamped), temperature);
+
+            logits[idx] = std::exp(logits[idx]) / divisor;
+            ++idx;
+        }
+    }
+}
+
+// Splits `mask_count` tokens as evenly as possible over `steps` steps.
+// Entry i is the number of tokens to commit at step i; the first
+// `mask_count % steps` entries receive one extra token, so the entries add up to
+// `mask_count`.
+// Returns an empty vector when `steps` is not positive, which avoids a division
+// by zero and a vector constructed with a negative size.
+static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
+    if (steps <= 0) {
+        return {};
+    }
+
+    std::vector<int32_t> num_transfer_tokens(steps);
+
+    int32_t base      = mask_count / steps;
+    int32_t remainder = mask_count % steps;
+
+    for (int32_t i = 0; i < steps; i++) {
+        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
+    }
+
+    return num_transfer_tokens;
+}
+
+// Runs masked-diffusion generation over a fixed-length token sequence.
+//
+// The sequence starts as `input_tokens` followed by mask tokens up to
+// params.max_length. Every step decodes the whole sequence with non-causal
+// attention, samples a token for each masked position under consideration and
+// commits a subset of them, as selected by params.algorithm and params.schedule.
+// With the block schedule, block b covers positions
+// [n_input + b * block_length, n_input + (b + 1) * block_length), clamped to
+// params.max_length.
+//
+// ctx           - context to decode with; causal attention is switched off on it
+// input_tokens  - prompt tokens, n_input entries
+// output_tokens - caller-provided buffer with room for params.max_length tokens
+// n_input       - number of prompt tokens; must be > 0 and < params.max_length
+// params        - generation settings
+// n_generated   - set to 0 when the arguments are rejected, otherwise to
+//                 params.max_length
+//
+// NOTE: a failed llama_decode only leaves the step loop of the current block;
+// n_generated is still set to params.max_length afterwards.
+void diffusion_generate(llama_context *          ctx,
+                        const llama_token *      input_tokens,
+                        llama_token *            output_tokens,
+                        int32_t                  n_input,
+                        const diffusion_params & params,
+                        int32_t &                n_generated) {
+    n_generated = 0;
+    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
+        return;
+    }
+
+    const llama_model * model = llama_get_model(ctx);
+
+    // Initialize with input and pad with mask tokens
+    std::copy(input_tokens, input_tokens + n_input, output_tokens);
+    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
+
+    std::mt19937 rng(params.seed);
+
+    llama_set_causal_attn(ctx, false);
+
+    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+
+    std::vector<llama_token_data> candidates(n_vocab);
+    std::vector<llama_token_data> conf_candidates;
+    conf_candidates.reserve(params.max_length);
+    std::vector<int32_t> mask_positions;
+    mask_positions.reserve(params.max_length);
+
+    // Setup sampler chain
+    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    if (params.top_k > 0) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
+    }
+    if (params.top_p < 1.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
+    }
+    if (params.temperature > 0.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
+    }
+    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
+
+    // Separate sampler used to draw positions by confidence when alg_temp != 0
+    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
+
+    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
+    batch.n_tokens    = params.max_length;
+
+    // Pre-allocate buffers for CFG if needed
+    int32_t                  logits_size = n_vocab * params.max_length;
+    std::vector<float>       cond_logits_buffer;
+    std::vector<llama_token> un_x_buffer;
+    if (params.cfg_scale > 0.0f) {
+        cond_logits_buffer.resize(logits_size);
+        un_x_buffer.resize(params.max_length);
+    }
+
+    // For block-based processing
+    std::vector<int32_t> num_transfer_tokens;
+    int32_t              num_blocks      = 1;
+    int32_t              steps_per_block = params.steps;
+
+    if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
+        GGML_ASSERT(params.max_length % params.block_length == 0);
+        num_blocks = params.max_length / params.block_length;
+        GGML_ASSERT(params.steps % num_blocks == 0);
+        steps_per_block = params.steps / num_blocks;
+    }
+
+    int64_t total_sampling_time = 0;
+    int64_t total_time          = 0;
+    int64_t time_start          = ggml_time_us();
+
+    for (int block_num = 0; block_num < num_blocks; block_num++) {
+        int32_t block_start = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
+        int32_t block_end   = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ?
+                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
+                                  params.max_length;
+
+        // Count masked tokens in current block for block-based processing
+        if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
+            int32_t block_mask_count = 0;
+            for (int i = block_start; i < block_end; i++) {
+                if (output_tokens[i] == params.mask_token_id) {
+                    block_mask_count++;
+                }
+            }
+            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
+        }
+
+        for (int32_t step = 0; step < steps_per_block; step++) {
+            int32_t global_step = block_num * steps_per_block + step;
+
+            // The callback may stop the current block by returning false
+            if (params.step_callback) {
+                if (!params.step_callback(
+                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
+                    break;
+                }
+            }
+
+            // Setup batch
+            for (int32_t i = 0; i < params.max_length; i++) {
+                batch.token[i]     = output_tokens[i];
+                batch.pos[i]       = i;
+                batch.n_seq_id[i]  = 1;
+                batch.seq_id[i][0] = 0;
+                batch.logits[i]    = 1;
+            }
+
+            float * logits = nullptr;
+
+            if (params.cfg_scale > 0.0f) {
+                int ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("Failed to generate conditional\n");
+                    break;
+                }
+                // Copy the conditional logits before the second decode produces new ones
+                float * cond_logits_ptr = llama_get_logits(ctx);
+                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
+
+                // Unconditional generation (mask input)
+                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
+                for (int32_t i = 0; i < n_input; i++) {
+                    un_x_buffer[i] = params.mask_token_id;
+                }
+
+                for (int32_t i = 0; i < params.max_length; i++) {
+                    batch.token[i] = un_x_buffer[i];
+                }
+                ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("Failed to generate unconditional\n");
+                    break;
+                }
+                float * uncond_logits = llama_get_logits(ctx);
+
+                // Apply CFG
+                for (int32_t i = 0; i < logits_size; i++) {
+                    cond_logits_buffer[i] =
+                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
+                }
+                logits = cond_logits_buffer.data();
+            } else {
+                int ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
+                    break;
+                }
+                logits = llama_get_logits(ctx);
+            }
+
+            if (!logits) {
+                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
+                break;
+            }
+
+            // With shift_logits, position i is predicted by logits row i - 1 (row 0 for i == 0)
+            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
+                if (params.shift_logits) {
+                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
+                }
+                return logits + pos * n_vocab;
+            };
+
+            int64_t time_start_sampling = ggml_time_us();
+
+            mask_positions.clear();
+            for (int32_t i = 0; i < params.max_length; i++) {
+                if (output_tokens[i] == params.mask_token_id) {
+                    // For block-based, only consider current block
+                    if (params.schedule != DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED || (i >= block_start && i < block_end)) {
+                        mask_positions.push_back(i);
+                    }
+                }
+            }
+
+            if (mask_positions.empty()) {
+                break;
+            }
+
+            // NOTE(review): only the first n_vocab logits (row 0) are perturbed here -
+            // confirm whether every row was meant to receive noise.
+            if (params.add_gumbel_noise && params.temperature > 0.0f) {
+                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
+            }
+
+            if (params.algorithm == DIFFUSION_ALGORITHM_ORIGIN) {
+                // Commit each masked position independently with probability p_transfer
+                int32_t transfer_count = calculate_transfer_count(
+                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
+                float p_transfer = (float) transfer_count / mask_positions.size();
+
+                for (int32_t pos : mask_positions) {
+                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
+                        const float * pos_logits = get_logits_for_pos(pos);
+                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                            candidates[token_id].id    = token_id;
+                            candidates[token_id].logit = pos_logits[token_id];
+                            candidates[token_id].p     = 0.0f;
+                        }
+
+                        llama_token_data_array cur_p = {
+                            candidates.data(),
+                            (size_t) n_vocab,
+                            -1,
+                            false,
+                        };
+
+                        llama_sampler_apply(sampler, &cur_p);
+                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
+                    }
+                }
+            } else {
+                // Sample every masked position, then commit the ones chosen by confidence.
+                // `confidences` holds (score, index into mask_positions) pairs.
+                std::vector<std::pair<float, int32_t>> confidences;
+                std::vector<llama_token>               sampled_tokens(mask_positions.size());
+
+                for (size_t i = 0; i < mask_positions.size(); i++) {
+                    int32_t       pos        = mask_positions[i];
+                    const float * pos_logits = get_logits_for_pos(pos);
+
+                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                        candidates[token_id].logit = pos_logits[token_id];
+                        candidates[token_id].p     = 0.0f;
+                        candidates[token_id].id    = token_id;
+                    }
+
+                    llama_token_data_array cur_p = {
+                        candidates.data(),
+                        candidates.size(),
+                        -1,
+                        false,
+                    };
+
+                    llama_sampler_apply(sampler, &cur_p);
+                    llama_token sampled_token = cur_p.data[cur_p.selected].id;
+
+                    float conf = calculate_confidence(cur_p, params.algorithm, rng);
+
+                    sampled_tokens[i] = sampled_token;
+                    confidences.emplace_back(conf, i);
+                }
+
+                int32_t transfer_count = calculate_transfer_count(
+                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
+
+                if (transfer_count > 0) {
+                    if (params.alg_temp == 0.0f) {
+                        // Deterministic: take the highest scores, ties broken by lower index
+                        std::partial_sort(confidences.begin(),
+                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
+                                          confidences.end(),
+                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
+                                              if (a.first != b.first) {
+                                                  return a.first > b.first;
+                                              }
+                                              return a.second < b.second;
+                                          });
+
+                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
+                            int32_t mask_idx   = confidences[i].second;
+                            int32_t pos        = mask_positions[mask_idx];
+                            output_tokens[pos] = sampled_tokens[mask_idx];
+                        }
+                    } else {
+                        // Stochastic: draw positions without replacement, using the
+                        // temperature-scaled scores as logits
+                        conf_candidates.clear();
+                        for (size_t i = 0; i < confidences.size(); i++) {
+                            float conf_logit = confidences[i].first / params.alg_temp;
+                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
+                        }
+
+                        llama_token_data_array conf_array = {
+                            conf_candidates.data(),
+                            conf_candidates.size(),
+                            -1,
+                            false,
+                        };
+
+                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
+                            llama_sampler_apply(dist_sampler, &conf_array);
+                            const int32_t selected_idx = conf_array.selected;
+                            // Map back through the candidate id, which stays valid even if
+                            // the sampler reordered the array
+                            const int32_t mask_idx = conf_array.data[selected_idx].id;
+                            const int32_t pos      = mask_positions[mask_idx];
+                            output_tokens[pos]     = sampled_tokens[mask_idx];
+
+                            // Exclude this entry from later draws. The probabilities are
+                            // recomputed from the logits on every apply, so clearing `p`
+                            // alone would not prevent the same position being drawn again.
+                            conf_array.data[selected_idx].logit = -INFINITY;
+                            conf_array.data[selected_idx].p     = 0.0f;
+                            conf_array.selected                 = -1;
+                        }
+                    }
+                }
+            }
+
+            int64_t time_end_sampling = ggml_time_us();
+            total_sampling_time += time_end_sampling - time_start_sampling;
+        }
+    }
+
+    int64_t time_end = ggml_time_us();
+    total_time += time_end - time_start;
+
+    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
+            total_time / 1000.0,
+            total_time / 1000.0 / params.steps,
+            total_sampling_time / 1000.0 / params.steps);
+
+    llama_batch_free(batch);
+    llama_sampler_free(sampler);
+    llama_sampler_free(dist_sampler);
+
+    n_generated = params.max_length;
+}
--- a/examples/diffusion/diffusion.h
+++ b/examples/diffusion/diffusion.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "llama.h"
+
+#include <cstdint>
+
+// Rule that decides which masked positions are committed in a step.
+// The numeric values are used as array indices when printing the algorithm
+// name, so keep them stable.
+enum diffusion_algorithm {
+    DIFFUSION_ALGORITHM_ORIGIN           = 0,  // commit each masked position at random with the scheduled probability
+    DIFFUSION_ALGORITHM_ENTROPY_BASED    = 1,  // score = negative entropy of the candidate probabilities
+    DIFFUSION_ALGORITHM_MARGIN_BASED     = 2,  // score = gap between the two leading candidate probabilities
+    DIFFUSION_ALGORITHM_RANDOM           = 3,  // score = uniform random number
+    DIFFUSION_ALGORITHM_CONFIDENCE_BASED = 4,  // score = probability of the sampled token
+};
+
+// Unified transfer scheduling methods: how many masked tokens are committed per step.
+// The numeric values are used as array indices when printing the schedule name.
+enum diffusion_transfer_schedule {
+    DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
+    DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
+};
+
+// Progress callback invoked by diffusion_generate() at the start of every step,
+// before the sequence is decoded.
+//
+// step        - global step index (0-based, counted across blocks)
+// total_steps - diffusion_params::steps
+// tokens      - current contents of the output buffer
+// n_tokens    - diffusion_params::max_length
+// user_data   - diffusion_params::step_callback_user_data, passed through unchanged
+//
+// Returning false leaves the step loop of the current block.
+typedef bool (*diffusion_step_callback_t)(int32_t             step,
+                                          int32_t             total_steps,
+                                          const llama_token * tokens,
+                                          int32_t             n_tokens,
+                                          void *              user_data);
+
+// Settings consumed by diffusion_generate().
+struct diffusion_params {
+    int32_t                   steps                   = 0;      // Total number of denoising steps (split evenly across blocks)
+    float                     temperature             = 0.0f;   // > 0 adds a temperature sampler to the chain
+    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;  // Token that marks a not-yet-generated position
+    diffusion_step_callback_t step_callback           = nullptr;  // Optional per-step callback
+    void *                    step_callback_user_data = nullptr;  // Passed through to step_callback
+    int32_t                   seed                    = 0;      // Seeds the RNG and the dist samplers
+    bool                      visual_mode             = false;  // NOTE(review): not read by diffusion_generate; presumably for the callback - confirm
+    bool                      shift_logits            = false;  // Shift logits by -1 after decode
+
+    float   top_p = 0.0f;  // Top-p sampler is added when < 1.0
+    int32_t top_k = 0;     // Top-k sampler is added when > 0
+
+    diffusion_algorithm         algorithm = DIFFUSION_ALGORITHM_CONFIDENCE_BASED;
+    diffusion_transfer_schedule schedule  = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
+
+    float   cfg_scale        = 0.0f;   // Config scale for classifier-free guidance (enabled when > 0)
+    float   eps              = 0.0f;   // Timestep scheduling
+    int32_t block_length     = 0;      // Block size (for block scheduling)
+    float   alg_temp         = 0.0f;   // algorithm temperature (0.0 = deterministic)
+    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0
+
+    int32_t max_length = 0;            // Maximum sequence length
+};
+
+// Generates tokens by iterative unmasking.
+//
+// ctx           - llama context used for decoding
+// input_tokens  - prompt tokens, n_input entries
+// output_tokens - caller-provided buffer with room for params.max_length tokens;
+//                 receives the prompt followed by the generated tokens
+// n_input       - number of prompt tokens; must be > 0 and < params.max_length
+// params        - generation settings
+// n_generated   - set to 0 when the arguments are rejected, otherwise to
+//                 params.max_length
+void diffusion_generate(llama_context *          ctx,
+                        const llama_token *      input_tokens,
+                        llama_token *            output_tokens,
+                        int32_t                  n_input,
+                        const diffusion_params & params,
+                        int32_t &                n_generated);
--- a/examples/llama-eval/README.md
+++ b/examples/llama-eval/README.md
@@ -0,0 +1,26 @@
+# llama-eval
+
+Simple evaluation tool for llama.cpp with support for multiple datasets.
+
+For a full description, usage examples, and sample results, see:
+
+- [PR 21152](https://github.com/ggml-org/llama.cpp/pull/21152)
+
+## Quick start
+
+```bash
+# Single server
+python3 llama-eval.py \
+  --server http://localhost:8033 \
+  --model my-model \
+  --dataset gsm8k --n_cases 100 \
+  --grader-type regex --threads 32
+
+# Multiple servers (comma-separated URLs and thread counts)
+python3 llama-eval.py \
+  --server http://server1:8033,http://server2:8033 \
+  --server-name server1,server2 \
+  --threads 16,16 \
+  --dataset aime2025 --n_cases 240 \
+  --grader-type regex
+```
--- a/examples/llama-eval/llama-eval.py
+++ b/examples/llama-eval/llama-eval.py
--- a/examples/llama-eval/llama-server-simulator.py
+++ b/examples/llama-eval/llama-server-simulator.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import random
+import re
+import time
+import sys
+import os
+import threading
+from http.server import HTTPServer, BaseHTTPRequestHandler
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+from pathlib import Path
+
+import datasets
+
+# Set cache directory for HuggingFace datasets
+cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
+cache_dir.mkdir(parents=True, exist_ok=True)
+os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
+
+def dice(s1: str, s2: str) -> float:
+    """Calculate Dice coefficient between two strings based on bigram overlap."""
+    if not s1 and not s2:
+        return 1.0
+
+    def _bigrams(s: str):
+        return [s[i : i + 2] for i in range(len(s) - 1)]
+
+    bigrams1 = _bigrams(s1)
+    bigrams2 = _bigrams(s2)
+
+    if not bigrams1 and not bigrams2:
+        return 1.0
+
+    from collections import Counter
+
+    freq1 = Counter(bigrams1)
+    freq2 = Counter(bigrams2)
+
+    intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
+    dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
+    return dice_coeff
+
+def debug_log(message: str):
+    """Log a debug message to stderr and append it to /tmp/simulator-debug.log"""
+    print(message, file=sys.stderr)
+    with open("/tmp/simulator-debug.log", "a") as f:
+        f.write(message + "\n")
+
+simulator: Optional["Simulator"] = None
+
+@dataclass
+class EvalState:
+    id: str
+    tasks: List[str]
+    task_states: Dict[str, Dict]
+    sampling_config: Dict
+
+def normalize_number(s: str) -> Optional[int]:
+    """Parse the leading run of digits of `s`; None when `s` does not start with a digit."""
+    leading = re.match(r"\d+", s)
+    # anything following the digits is ignored
+    return int(leading.group(0)) if leading is not None else None
+
+class AimeDataset:
+    def __init__(self, split: str = "train"):
+        self.split = split
+        self.questions: List[Dict] = []
+        self._load_dataset()
+
+    def _load_dataset(self):  # load split `self.split` of AI-MO/aimo-validation-aime into self.questions
+        print(f"Loading AIME dataset (split: {self.split})...")
+        # NOTE(review): a versioned dataset sub-directory is passed as cache_dir below; confirm load_dataset expects that rather than the cache root
+        cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
+        if cache_path.exists():
+            print(f"Using cached dataset from {cache_path}")
+            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
+        else:
+            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
+
+        self.questions = list(ds)  # materialize the split as a plain list so it can be iterated repeatedly
+        print(f"AIME dataset loaded: {len(self.questions)} questions")
+
+    def find_question(self, request_text: str) -> Optional[Dict]:
+        """Return the question matching `request_text` (exact, LaTeX-stripped, or best Dice score > 0.3), else None."""
+        best_match = None
+        best_distance = -1  # despite the name this is a Dice similarity: higher means closer
+        best_index = -1
+        request_lower = request_text.lower()  # loop-invariant, so lower-case the request once up front
+
+        for i, question in enumerate(self.questions):
+            question_text = question["problem"]
+            question_lower = question_text.lower()
+
+            # Exact match
+            if question_lower == request_lower:
+                debug_log(f"DEBUG: Found exact match at index {i}")
+                return question
+
+            # Remove LaTeX formatting for more flexible matching
+            question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
+            if question_no_latex.lower() == request_lower:
+                debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
+                return question
+
+            # Calculate Dice coefficient for partial matches
+            # Only consider if request is at least 50% of question length
+            if len(request_lower) >= len(question_lower) * 0.5:
+                distance = dice(question_lower, request_lower)
+                if distance > best_distance:
+                    best_distance = distance
+                    best_match = question
+                    best_index = i
+
+        if best_match is not None and best_distance > 0.3:  # Threshold for partial match
+            debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
+            return best_match
+
+        debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
+        return None
+
+    def get_answer(self, question: Dict) -> str:
+        answer = question["answer"]
+        if isinstance(answer, str):
+            normalized = normalize_number(answer)
+            return str(normalized) if normalized is not None else answer
+        return str(answer)
+
+class Simulator:
+    def __init__(
+        self,
+        port: int = 8033,
+        host: str = "localhost",
+        success_rate: float = 0.8,
+        dataset_split: str = "train"
+    ):
+        self.port = port
+        self.host = host
+        self.success_rate = success_rate
+        self.dataset = AimeDataset(dataset_split)
+        self.eval_state = EvalState(
+            id="aime-2025",
+            tasks=["aime"],
+            task_states={},
+            sampling_config={"temperature": 0, "max_tokens": 2048}
+        )
+
+    def _generate_response(
+        self,
+        question: Dict,
+        should_be_correct: bool
+    ) -> Dict:
+        expected_answer = self.dataset.get_answer(question)
+
+        if should_be_correct:
+            response_text = expected_answer
+        else:
+            response_text = self._generate_wrong_answer(question)
+
+        return {
+            "id": f"chatcmpl-{int(time.time())}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": "llama",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": response_text
+                    },
+                    "finish_reason": "stop"
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 50,
+                "total_tokens": 150
+            }
+        }
+
+    def _generate_wrong_answer(self, question: Dict) -> str:
+        """Return an answer that differs from the expected one (numeric: plus one, otherwise a ' (wrong)' suffix)."""
+        expected = self.dataset.get_answer(question)
+
+        # Non-numeric answers cannot be incremented, so they are tagged instead
+        if not expected.isdigit():
+            return expected + " (wrong)"
+
+        return str(int(expected) + 1)
+
+    def _process_request(self, request_data: Dict) -> Dict:  # build a simulated chat completion, or an {"error": ...} dict
+        messages = request_data.get("messages", [])
+        if not messages:
+            return {"error": "No messages in request"}
+
+        request_text = messages[0].get("content", "")  # only the first message is inspected, regardless of its role
+        debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
+
+        question = self.dataset.find_question(request_text)
+        if not question:
+            debug_log(f"DEBUG: find_question returned None")
+            return {"error": "No matching question found"}
+
+        should_be_correct = random.random() < self.success_rate  # random draw decides whether this reply is correct
+
+        response = self._generate_response(question, should_be_correct)
+
+        task_id = "aime"  # fixed key: every request overwrites the previously recorded state
+        self.eval_state.task_states[task_id] = {
+            "correct": should_be_correct,
+            "expected": self.dataset.get_answer(question),
+            "predicted": response["choices"][0]["message"]["content"]
+        }
+
+        return response
+
+class RequestHandler(BaseHTTPRequestHandler):
+    def do_POST(self):
+        """Serve POST /v1/chat/completions; any other path is answered with 404."""
+        if self.path != "/v1/chat/completions":
+            self._send_json({"error": "Not found"}, 404)
+            return
+        try:
+            content_length = int(self.headers.get("Content-Length", 0))
+            body = self.rfile.read(content_length)
+            request_data = json.loads(body) if body else None
+            # Only a non-empty JSON object is usable: a list/string/number would fail later on .get() and surface as a 500
+            if not isinstance(request_data, dict) or not request_data:
+                self._send_json({"error": "Invalid JSON"}, 400)
+                return
+
+            if simulator is None:
+                self._send_json({"error": "Simulator not initialized"}, 500)
+                return
+
+            response = simulator._process_request(request_data)
+            self._send_json(response, 200)  # error payloads returned by _process_request are also sent with 200
+
+        except json.JSONDecodeError:
+            self._send_json({"error": "Invalid JSON"}, 400)
+        except Exception as e:
+            print(f"Error processing request: {e}")
+            self._send_json({"error": str(e)}, 500)
+
+    def _send_json(self, data: dict, status: int = 200):
+        body = json.dumps(data).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format, *args):
+        # Suppress default request logging
+        pass
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="llama-server simulator for testing eval scripts"
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8033,
+        help="Server port (default: 8033)"
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Server host (default: localhost)"
+    )
+    parser.add_argument(
+        "--success-rate",
+        type=float,
+        default=0.8,
+        help="Success rate 0-1 (default: 0.8)"
+    )
+    parser.add_argument(
+        "--dataset-split",
+        type=str,
+        default="train",
+        help="AIME dataset split to use (default: train)"
+    )
+
+    args = parser.parse_args()
+
+    global simulator
+    simulator = Simulator(
+        port=args.port,
+        host=args.host,
+        success_rate=args.success_rate,
+        dataset_split=args.dataset_split
+    )
+
+    server = HTTPServer((args.host, args.port), RequestHandler)
+    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
+    server_thread.start()
+
+    print("\n=== llama-server-simulator ===")
+    print(f"Server running on http://{args.host}:{args.port}")
+    print(f"Success rate: {args.success_rate}")
+    print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
+    print("\nPress Ctrl+C to stop\n")
+
+    try:
+        server_thread.join()
+    except KeyboardInterrupt:
+        print("\nShutting down...")
+        server.shutdown()
+
+if __name__ == "__main__":
+    main()
--- a/examples/llama-eval/test-simulator.sh
+++ b/examples/llama-eval/test-simulator.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+set -e
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "=== llama-server-simulator Test Script ==="
+echo ""
+
+PORT=8033
+SUCCESS_RATE=0.8
+TEST_PORT=8034
+
+echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
+source "$SCRIPT_DIR/venv/bin/activate"
+python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
+SIMULATOR_PID=$!
+trap 'kill "$SIMULATOR_PID" 2>/dev/null || true' EXIT  # never leave the simulator running if a later step aborts under `set -e`
+echo "Waiting for simulator to start..."
+sleep 5
+
+# Helper function to make a request and extract the answer
+make_request() {
+  local question="$1"
+  curl -s -X POST http://localhost:$PORT/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d "{
+      \"model\": \"llama\",
+      \"messages\": [
+        {\"role\": \"user\", \"content\": \"$question\"}
+      ],
+      \"temperature\": 0,
+      \"max_tokens\": 2048
+    }" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
+}
+
+# Test question (repeated in multiple tests)
+TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
+
+echo ""
+echo "=== Test 1: Correct Answer ==="
+echo "Sending request with known question..."
+answer=$(make_request "$TEST_QUESTION")
+echo "Answer: $answer"
+echo "Expected: 116"
+echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
+
+echo ""
+echo "=== Test 2: Wrong Answer ==="
+echo "Sending request with known question (simulator still runs at success rate $SUCCESS_RATE, so a wrong answer is only expected some of the time)..."
+answer=$(make_request "$TEST_QUESTION")
+echo "Answer: $answer"
+echo "Expected: 116"
+echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
+
+echo ""
+echo "=== Test 3: No Matching Question ==="
+echo "Sending request with non-matching text..."
+response=$(make_request "What is the capital of France?")
+echo "Response: $response"
+echo "Expected: No matching question found"
+echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
+
+echo ""
+echo "=== Test 4: Success Rate Verification ==="
+echo "Sending 10 requests to test success rate..."
+correct_count=0
+for i in {1..10}; do
+  answer=$(make_request "$TEST_QUESTION")
+  if [ "$answer" == "116" ]; then
+    correct_count=$((correct_count + 1))
+  fi
+  echo "  Request $i: Answer = $answer"
+done
+echo "Correct answers: $correct_count/10"
+echo "Expected: ~8/10 (80% success rate)"
+echo "Success rate: $((correct_count * 10))%"  # shell arithmetic yields the same integer percentage without requiring bc
+
+echo ""
+echo "=== Test Complete ==="
+echo "Stopping simulator..."
+kill $SIMULATOR_PID 2>/dev/null || true  # under `set -e` a failed kill (simulator already gone) would abort the script here
+wait $SIMULATOR_PID 2>/dev/null || true
+
+echo "Simulator stopped."
--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@@ -52,6 +52,10 @@ causal-convert-mm-model:
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/causal/convert-model.sh

+	$(MAKE) causal-convert-mmproj MM_OUTTYPE="$(MM_OUTTYPE)"
+
+causal-convert-mmproj:
+	$(call validate_model_path,causal-convert-mmproj)
 	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(MM_OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/causal/convert-model.sh --mmproj
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -38,8 +38,12 @@ int main(int argc, char ** argv) {
    std::string result0;
    std::string result1;
    std::string result2;
+    std::string result3;

    // init
+
+    ggml_backend_load_all();
+
    auto llama_init = common_init_from_params(params);

    auto * model = llama_init->model();
@@ -213,11 +217,83 @@ int main(int argc, char ** argv) {
        n_past += 1;
    }

+    // test on-device state save/load
+    auto params_ctx4 = common_context_params_to_llama(params);
+    params_ctx4.n_seq_max = 2;
+    llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
+
+    llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
+
+    printf("\nsingle seq run: %s", params.prompt.c_str());
+
+    // load state (rng, logits, embedding and kv_cache) from file
+    n_token_count_out = 0;
+
+    if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
+        fprintf(stderr, "\n%s : failed to load state\n", __func__);
+        return 1;
+    }
+
+    fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
+
+    // restore state (last tokens)
+    n_past = n_token_count_out;
+    if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
+        return 1;
+    }
+    ++n_past;
+
+    // save seq 0 and load into seq 1
+    {
+        // save kv of seq 0
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
+        const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+        if (ncopy != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq copy data length %zu does not match expected length %zu\n", __func__, ncopy, seq_store.size());
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 0 copied, %zu bytes\n", __func__, ncopy); // size_t is printed with %zu (%zd is for signed sizes)
+
+        // erase whole kv
+        llama_memory_clear(llama_get_memory(ctx4), true);
+        fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+        // restore kv into seq 1
+        const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+        if (nset != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq set data length %zu does not match expected length %zu\n", __func__, nset, seq_store.size());
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 1 restored, %zu bytes\n", __func__, nset);
+    }
+
+    // fourth run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token     = llama_sampler_sample(smpl4, ctx4, -1);
+        auto next_token_str = common_token_to_piece(ctx4, next_token);
+
+        printf("%s", next_token_str.c_str());
+        result3 += next_token_str;
+
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {1}, true);
+
+        if (llama_decode(ctx4, batch)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
+            return 1;
+        }
+        n_past += 1;
+    }
+
    printf("\n");

    llama_sampler_free(smpl);
    llama_sampler_free(smpl2);
    llama_sampler_free(smpl3);
+    llama_sampler_free(smpl4);

    llama_batch_free(batch);

@@ -226,12 +302,18 @@ int main(int argc, char ** argv) {

    llama_free(ctx2);
    llama_free(ctx3);
+    llama_free(ctx4);

    if (result0 != result2) {
        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
        return 1;
    }

+    if (result0 != result3) { // result3 comes from the on-device save/restore run
+        fprintf(stderr, "\n%s : error : the on-device seq restore generation is different\n", __func__);
+        return 1;
+    }
+
    fprintf(stderr, "\n%s : success\n", __func__);

    return 0;
--- a/examples/speculative-simple/README.md
+++ b/examples/speculative-simple/README.md
@@ -6,7 +6,7 @@ Demonstration of basic greedy speculative decoding
 ./bin/llama-speculative-simple \
    -m  ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
    -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
-    -f test.txt -c 0 -ngl 99 --color \
-    --sampling-seq k --top-k 1 -fa --temp 0.0 \
-    -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
+    -f test.txt -c 0 -ngl 99 --color on \
+    --sampling-seq k --top-k 1 -fa on --temp 0.0 \
+    -ngld 99 --spec-draft-n-max 16 --spec-draft-n-draft-min 5 --draft-p-min 0.9
 ```
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -13,20 +13,6 @@
 #include <vector>
 #include <utility>

-struct spec_checkpoint {
-    int64_t n_tokens = 0;
-
-    std::vector<uint8_t> data;
-
-    size_t size() const {
-        return data.size();
-    }
-
-    bool empty() const {
-        return data.empty();
-    }
-};
-
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");

@@ -43,11 +29,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.speculative.draft.mparams.path.empty()) {
-        LOG_ERR("%s: --model-draft is required\n", __func__);
-        return 1;
-    }
-
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -62,18 +43,11 @@ int main(int argc, char ** argv) {
    model_tgt = llama_init_tgt->model();
    ctx_tgt   = llama_init_tgt->context();

-    // check if the context supports partial sequence removal
-    const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
-    const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
-
-    if (use_ckpt) {
-        LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
-    }
-
    const llama_vocab * vocab = llama_model_get_vocab(model_tgt);

    // load the draft model
    llama_model_ptr model_dft;
+    llama_context_ptr ctx_dft;

    // TODO: simplify this logic
    {
@@ -81,9 +55,6 @@ int main(int argc, char ** argv) {

        auto params_dft = params;

-        params_dft.n_parallel   = 1;
-        params_dft.n_ctx        = params_spec.n_ctx;
-        params_dft.n_batch      = llama_n_ctx_seq(ctx_tgt);
        params_dft.devices      = params_spec.devices;
        params_dft.model        = params_spec.mparams;
        params_dft.n_gpu_layers = params_spec.n_gpu_layers;
@@ -103,8 +74,19 @@ int main(int argc, char ** argv) {
            return 1;
        }

-        params.speculative.draft.model = model_dft.get();
-        params.speculative.draft.cparams = common_context_params_to_llama(params_dft);
+        auto cparams = common_context_params_to_llama(params_dft);
+        ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams));
+
+        params.speculative.draft.ctx_tgt = ctx_tgt;
+        params.speculative.draft.ctx_dft = ctx_dft.get();
+    }
+
+    // check if the context supports partial sequence removal
+    const bool use_ckpt_tgt = (common_context_can_seq_rm(ctx_tgt)       == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
+    const bool use_ckpt_dft = (common_context_can_seq_rm(ctx_dft.get()) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
+
+    if (use_ckpt_tgt) {
+        LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
    }

    // Tokenize the prompt
@@ -136,6 +118,8 @@ int main(int argc, char ** argv) {
    // used to determine end of generation
    bool has_eos = false;

+    llama_seq_id seq_id = 0;
+
    // ================================================
    // everything until here is standard initialization
    // the relevant stuff for speculative decoding starts here
@@ -146,7 +130,8 @@ int main(int argc, char ** argv) {
    common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));

    // eval the prompt
-    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
+    llama_decode(ctx_tgt,       llama_batch_get_one(inp.data(), inp.size() - 1));
+    llama_decode(ctx_dft.get(), llama_batch_get_one(inp.data(), inp.size() - 1));

    // note: keep the last token separate!
    llama_token id_last = inp.back();
@@ -160,16 +145,16 @@ int main(int argc, char ** argv) {
    // init the speculator
    const auto & params_spec = params.speculative;

-    struct common_speculative * spec = common_speculative_init(params.speculative, ctx_tgt);
+    struct common_speculative * spec = common_speculative_init(params.speculative, 1);

-    common_speculative_begin(spec, prompt_tgt);
+    common_speculative_begin(spec, seq_id, prompt_tgt);

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);

    size_t n_draft = 0;

    llama_tokens draft;
-    spec_checkpoint spec_ckpt;
+    common_prompt_checkpoint ckpt;

    const auto t_enc_end = ggml_time_us();

@@ -184,40 +169,57 @@ int main(int argc, char ** argv) {
        // from a cache or lookup tables.
        //
        if (draft.empty()) {
+            ckpt.update_pos(
+                    prompt_tgt.size(),
+                    llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), seq_id),
+                    llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), seq_id));
+
+            if (use_ckpt_dft) {
+                ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+            }
+
            // generate a new draft
-            draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
+            common_speculative_get_draft_params(spec, seq_id) = {
+                /* .drafting   = */ true,
+                /* .n_max      = */ -1,
+                /* .n_past     = */ n_past,
+                /* .id_last    = */ id_last,
+                /* .prompt     = */ &prompt_tgt,
+                /* .result     = */ &draft, // output
+            };
+            common_speculative_draft(spec);

            // save the original draft size
            n_draft = draft.size();

            // save a checkpoint of the target context before evaluating the draft
            // this allows us to restore the state if partial draft acceptance occurs
-            if (!draft.empty() && use_ckpt) {
-                const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                spec_ckpt.data.resize(ckpt_size);
+            if (!draft.empty()) {
+                if (use_ckpt_tgt) {
+                    ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+                }
+            }

-                const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                GGML_ASSERT(n == ckpt_size);
+            {
+                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);

-                spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
-                LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
-                        spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
+                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
            }
        } else {
            // we have a previous (partial) draft to reuse from checkpoint restoration
-            if (use_ckpt) {
-                GGML_ASSERT(!spec_ckpt.empty());
+            if (use_ckpt_tgt) {
+                GGML_ASSERT(!ckpt.empty());
            }
        }

        // always have a token to evaluate from before - id_last
        common_batch_clear(batch_tgt);
-        common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);
+        common_batch_add  (batch_tgt, id_last, n_past++, { seq_id }, true);

        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
        {
            for (size_t i = 0; i < draft.size(); ++i) {
-                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+                common_batch_add(batch_tgt, draft[i], n_past + i, { seq_id }, true);
            }

            //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
@@ -225,9 +227,15 @@ int main(int argc, char ** argv) {
            llama_decode(ctx_tgt, batch_tgt);
        }

+        // evaluate the same batch with the draft model
+        {
+            // TODO: extend to support MTP, Eagle, etc. See server code for reference
+            llama_decode(ctx_dft.get(), batch_tgt);
+        }
+
        // only save the sampler sampler state if we use checkpoints
        common_sampler_ptr smpl_save;
-        if (use_ckpt) {
+        if (use_ckpt_tgt) {
            smpl_save.reset(common_sampler_clone(smpl.get()));
        }

@@ -247,17 +255,24 @@ int main(int argc, char ** argv) {
        // check for partial draft acceptance:
        // if the context doesn't support partial sequence removal, restore the checkpoint
        // and make the accepted tokens the new partial draft for the next iteration
-        if (use_ckpt && ids.size() - 1 < draft.size()) {
+        if (use_ckpt_tgt && ids.size() - 1 < draft.size()) {
            LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());

            draft = std::move(ids);

-            const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-            GGML_ASSERT(n == spec_ckpt.size());
+            {
+                ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);

-            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
+                llama_memory_seq_rm(llama_get_memory(ctx_tgt), seq_id, ckpt.pos_max + 1, -1);
+            }

-            prompt_tgt.resize(spec_ckpt.n_tokens);
+            {
+                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
+
+                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
+            }
+
+            prompt_tgt.resize(ckpt.n_tokens);
            smpl = std::move(smpl_save);

            n_past = (int) prompt_tgt.size();
@@ -265,7 +280,7 @@ int main(int argc, char ** argv) {
            continue;
        }

-        common_speculative_accept(spec, ids.size() - 1);
+        common_speculative_accept(spec, seq_id, ids.size() - 1);

        // full acceptance: consume the draft and commit accepted tokens
        n_past    += ids.size() - 1;
@@ -305,7 +320,8 @@ int main(int argc, char ** argv) {
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

-            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
+            llama_memory_seq_rm(llama_get_memory(ctx_tgt),       seq_id, n_past, -1);
+            llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, n_past, -1);
        }

        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
--- a/examples/sycl/start-svr.sh
+++ b/examples/sycl/start-svr.sh
@@ -111,14 +111,14 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
    echo "Use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
    echo "Use all Intel GPUs, including iGPU & dGPU"
    GPUS_SETTING="-sm ${SPLIT_MODE}"
 fi

-echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap "
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap --host 0.0.0.0 --port 8000"
 ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000


--- a/examples/sycl/test.sh
+++ b/examples/sycl/test.sh
@@ -119,7 +119,7 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
    echo "Use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
    echo "Use all Intel GPUs, including iGPU & dGPU"
--- a/flake.lock
+++ b/flake.lock
@@ -1,58 +0,0 @@
-{
-  "nodes": {
-    "flake-parts": {
-      "inputs": {
-        "nixpkgs-lib": "nixpkgs-lib"
-      },
-      "locked": {
-        "lastModified": 1730504689,
-        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
-        "owner": "hercules-ci",
-        "repo": "flake-parts",
-        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
-        "type": "github"
-      },
-      "original": {
-        "owner": "hercules-ci",
-        "repo": "flake-parts",
-        "type": "github"
-      }
-    },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1732014248,
-        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "nixpkgs-lib": {
-      "locked": {
-        "lastModified": 1730504152,
-        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
-        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
-      },
-      "original": {
-        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-parts": "flake-parts",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)

 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 10)
+set(GGML_VERSION_MINOR 11)
 set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

@@ -249,6 +249,7 @@ option(GGML_SYCL                            "ggml: use SYCL"
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
 option(GGML_SYCL_HOST_MEM_FALLBACK          "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO         "ggml: use Level Zero API in SYCL backend"  ON)
 option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -169,7 +169,7 @@ extern "C" {
        // device type
        enum ggml_backend_dev_type type;
        // device id
-        //   for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        //   for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:c1:00.0")
        //   if the id is unknown, this should be NULL
        const char * device_id;
        // device capabilities
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -438,6 +438,12 @@ extern "C" {
        GGML_PREC_F32     = 10,
    };

+    // op hint: optional per-op information that a backend may use to select a specialised implementation
+    enum ggml_op_hint {
+        GGML_HINT_NONE             = 0, // no hint
+        GGML_HINT_SRC0_IS_HADAMARD = 1, // mul_mat: src0 is a Hadamard matrix; the CPU backend then runs a fast Walsh-Hadamard transform on src1 (unless the reference path is requested)
+    };
+
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN        = -1,
@@ -1419,6 +1425,11 @@ extern "C" {
            struct ggml_tensor * a,
            enum ggml_prec       prec);

+    // change the hint of a matrix multiplication
+    GGML_API void ggml_mul_mat_set_hint(
+            struct ggml_tensor * a,
+            enum ggml_op_hint    hint);
+
    // indirect matrix multiplication
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -965,7 +965,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
        }
        if (sched->debug > 1) {
            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_desc(node), node->name,
                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
                graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -450,12 +450,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ggml-cpu/arch/riscv/repack.cpp
            )
        if (GGML_CPU_RISCV64_SPACEMIT)
+            include(ggml-cpu/cmake/FindSMTIME.cmake)
            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
            list(APPEND GGML_CPU_SOURCES
                ggml-cpu/spacemit/ime.cpp
                ggml-cpu/spacemit/ime.h
+                ggml-cpu/spacemit/spine_mem_pool.cpp
+                ggml-cpu/spacemit/spine_mem_pool.h
+                ggml-cpu/spacemit/repack.cpp
+                ggml-cpu/spacemit/repack.h
+                ggml-cpu/spacemit/ime_env.cpp
+                ggml-cpu/spacemit/ime_env.h
                ggml-cpu/spacemit/ime1_kernels.cpp
+                ggml-cpu/spacemit/ime2_kernels.cpp
                ggml-cpu/spacemit/ime_kernels.h
+                ggml-cpu/spacemit/rvv_kernels.cpp
+                ggml-cpu/spacemit/rvv_kernels.h
            )
        endif()
        if(NOT GGML_CPU_ALL_VARIANTS)
@@ -485,6 +495,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            if (GGML_RV_ZIHINTPAUSE)
                string(APPEND MARCH_STR "_zihintpause")
            endif()
+            if (GGML_RV_ZBA)
+                string(APPEND MARCH_STR "_zba")
+            endif()
            if (GGML_CPU_RISCV64_SPACEMIT)
                # `xsmtvdotii' is only required for GCC >= 15.
                if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
@@ -578,13 +591,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        # Fetch KleidiAI sources:
        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.22.0")
-        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "54049037570ab0ee0a0d126b2ba5ece1")
+        set(KLEIDIAI_COMMIT_TAG "v1.24.0")
+        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/releases/download/${KLEIDIAI_COMMIT_TAG}/kleidiai-${KLEIDIAI_COMMIT_TAG}-src.tar.gz")
+        set(KLEIDIAI_RELEASE_ARCHIVE_MD5  "2f02ebe29573d45813e671eb304f2a00")

        set(KLEIDIAI_FETCH_ARGS
            URL ${KLEIDIAI_DOWNLOAD_URL}
-            URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
+            URL_HASH MD5=${KLEIDIAI_RELEASE_ARCHIVE_MD5}
        )
        if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
            list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -203,7 +203,6 @@
 #elif defined(__riscv)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
-#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -480,6 +480,104 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }

+#if defined(__riscv_v)
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
+    const int qk = QK1_0;
+    const int nb = n / qk; // number of q1_0 blocks in the row
+    assert(n % qk == 0);
+    // q1_0 x q8_0 dot product; ggml_vec_dot_q1_0_q8_0 selects this variant when VLEN >= 256 bits. The scalar result is stored to *s.
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    // LMUL = 1: a single vector register must hold the 32 int8 values handled per step (VLMAX >= 32)
+    const size_t vl32 = __riscv_vsetvl_e8m1(32);
+    assert(vl32 == 32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1); // element 0 = 0: start value for the widening reduction
+
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); // scale of the q1_0 block
+
+        float acc = 0;
+
+        for (int k = 0; k < 4; ++k) { // one q1_0 block is paired with 4 consecutive q8_0 blocks of 32 values
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const vbool8_t is_not_zero = __riscv_vlm_v_b8(x[ib].qs + 4 * k, vl32); // 32 bits (4 bytes) of x loaded as a lane mask
+
+            const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
+            const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
+            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(neg_qy, qy, is_not_zero, vl32); // lane i: mask bit set ? qy[i] : -qy[i]
+
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32); // widening sum of the 32 lanes into one int16
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
+        }
+
+        sumf += d0 * acc;
+    }
+
+    *s = sumf;
+}
+
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
+    const int qk = QK1_0;
+    const int nb = n / qk; // number of q1_0 blocks in the row
+    assert(n % qk == 0);
+    // q1_0 x q8_0 dot product; ggml_vec_dot_q1_0_q8_0 selects this variant when 128 <= VLEN < 256 bits. The scalar result is stored to *s.
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    // LMUL = 2: a two-register group must hold the 32 int8 values handled per step (VLMAX >= 32)
+    const size_t vl32 = __riscv_vsetvl_e8m2(32);
+    assert(vl32 == 32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1); // element 0 = 0: start value for the widening reduction
+
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); // scale of the q1_0 block
+
+        float acc = 0;
+
+        for (int k = 0; k < 4; ++k) { // one q1_0 block is paired with 4 consecutive q8_0 blocks of 32 values
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const vbool4_t is_not_zero = __riscv_vlm_v_b4(x[ib].qs + 4 * k, vl32); // 32 bits (4 bytes) of x loaded as a lane mask
+
+            const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
+            const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
+            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(neg_qy, qy, is_not_zero, vl32); // lane i: mask bit set ? qy[i] : -qy[i]
+
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32); // widening sum of the 32 lanes into one int16
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
+        }
+
+        sumf += d0 * acc;
+    }
+
+    *s = sumf;
+}
+#endif
+
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
+    assert(nrc == 1);
+    // Choose the kernel from the runtime vector register width (__riscv_vlenb() reports bytes).
+    const size_t vlen_bits = __riscv_vlenb() * 8;
+
+    if (vlen_bits >= 256) {
+        ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy); // 32 int8 lanes fit in one register (LMUL = 1)
+    } else if (vlen_bits >= 128) {
+        ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy); // 32 int8 lanes need a two-register group (LMUL = 2)
+    } else {
+        ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); // narrower vectors: scalar reference; bs/bx/by are only used here
+    }
+#else
+    ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); // built without the V extension
+#endif
+}
+
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
--- a/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake
+++ b/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake
@@ -0,0 +1,32 @@
+include(CheckCSourceCompiles) # defines check_c_source_compiles(); CheckCSourceRuns only defines check_c_source_runs()
+# Probe which SpacemiT IME instructions the C toolchain accepts and publish the result in RISCV64_SPACEMIT_IME_SPEC.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)" AND GGML_CPU_RISCV64_SPACEMIT)
+    set(SMT_MARCH_STR "-march=rv64gcv_zfh_zvfh_zba_zicbop")
+    if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
+        CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
+        string(APPEND SMT_MARCH_STR "_xsmtvdotii")
+    endif()
+    set(CMAKE_REQUIRED_FLAGS "${SMT_MARCH_STR}")
+    # Each probe only builds a one-instruction program; nothing is executed, so this also works when cross-compiling.
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S8)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vfwmadot v2, v0, v1, fp16\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFWMADOT_FP16)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S4)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S8)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot1 v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOTN)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vpack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vnspack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
+    unset(CMAKE_REQUIRED_FLAGS)
+
+    list(APPEND RISCV64_SPACEMIT_IME_SPEC "")
+    if (SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
+        set(RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME1")
+    endif()
+    # IME2 is only advertised when the 4-bit vmadot form and both pack instructions assemble.
+    if (SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4 AND SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK AND SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
+        list(APPEND RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME2")
+    endif()
+
+    message(STATUS "RISCV64_SPACEMIT_IME_SPEC: ${RISCV64_SPACEMIT_IME_SPEC}")
+endif()
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -50,6 +50,10 @@
 #include "llamafile/sgemm.h"
 #endif

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+#    include "spacemit/ime.h"
+#endif
+
 // Note: once we move threading into a separate C++ file
 // will use std::hardware_destructive_interference_size instead of hardcoding it here
 // and we'll use C++ attribute syntax.
@@ -1245,6 +1249,12 @@ void ggml_compute_forward_mul_mat(
    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

+    const int32_t hint = ggml_get_op_params_i32(dst, 1);
+    if (hint == GGML_HINT_SRC0_IS_HADAMARD && !params->use_ref) {
+        ggml_compute_forward_fwht(params, dst);
+        return;
+    }
+
    GGML_TENSOR_BINARY_OP_LOCALS

    const int ith = params->ith;
@@ -2959,6 +2969,45 @@ struct ggml_cplan ggml_graph_plan(
    return cplan;
 }

+
+static bool ggml_cpu_disable_fusion = false;  // initialized once in ggml_cpu_init(), read-only afterwards
+
+// Try to fuse the current node with subsequent nodes for better performance.
+// Returns the number of following nodes consumed by the fused kernel (>=1; the caller advances past them), or 0 if no fusion was applied.
+static int ggml_cpu_try_fuse_ops(
+        const struct ggml_cgraph * cgraph,
+        const int node_n,
+        const struct ggml_compute_params * params,
+        const struct ggml_cplan * cplan) {
+
+    if (ggml_cpu_disable_fusion || cplan->use_ref) { // fusion is skipped when disabled or when the plan asks for the reference path
+        return 0;
+    }
+
+    struct ggml_tensor * node = cgraph->nodes[node_n];
+
+    if (node->op == GGML_OP_RMS_NORM) {
+        // RMS_NORM + MUL fusion
+        const enum ggml_op fuse_ops[] = { GGML_OP_RMS_NORM, GGML_OP_MUL };
+        if (ggml_can_fuse(cgraph, node_n, fuse_ops, 2)) {
+            struct ggml_tensor * mul_node = cgraph->nodes[node_n + 1];
+            const struct ggml_tensor * mul_w = (mul_node->src[0] == node) // the MUL operand that is not the RMS_NORM output
+                ? mul_node->src[1] : mul_node->src[0];
+            if (node->src[0]->type  == GGML_TYPE_F32 && // the fused kernel is implemented for F32 only
+                mul_node->type      == GGML_TYPE_F32 &&
+                mul_w->type         == GGML_TYPE_F32 &&
+                mul_w->ne[0]        == node->ne[0]   && // weight rows are indexed over the full row length
+                mul_w->nb[0]        == sizeof(float)) { // weight rows are read as dense float arrays
+                // the fused kernel writes straight into mul_node; the RMS_NORM output is not materialized
+                ggml_compute_forward_rms_norm_mul_fused(params, node, mul_node);
+                return 1;
+            }
+        }
+    }
+
+    return 0;
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
    struct ggml_threadpool    * tp    = state->threadpool;
@@ -2966,7 +3015,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    const struct ggml_cgraph * cgraph = tp->cgraph;
    const struct ggml_cplan  * cplan  = tp->cplan;

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+    ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(state->ith);
+#else
    set_numa_thread_affinity(state->ith);
+#endif

    struct ggml_compute_params params = {
        /*.ith        =*/ state->ith,
@@ -2995,7 +3048,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            continue;
        }

-        ggml_compute_forward(&params, node);
+        // TODO: move fused-op detection into ggml_graph_plan so fusion decisions are made once at planning time
+        // Try fused ops, fall back to normal compute
+        const int n_fused = ggml_cpu_try_fuse_ops(cgraph, node_n, &params, cplan);
+        if (n_fused > 0) {
+            node_n += n_fused;
+        } else {
+            ggml_compute_forward(&params, node);
+        }

        if (state->ith == 0 && cplan->abort_callback &&
                cplan->abort_callback(cplan->abort_callback_data)) {
@@ -3016,6 +3076,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    ggml_barrier(state->threadpool);

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+    ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(state->ith);
+#endif
+
    return 0;
 }

@@ -3757,6 +3821,11 @@ void ggml_cpu_init(void) {
        ggml_init_riscv_arch_features();
 #endif

+        {
+            const char * env = getenv("GGML_CPU_DISABLE_FUSION");
+            ggml_cpu_disable_fusion = (env != NULL && atoi(env) == 1);
+        }
+
        is_first_call = false;
    }

--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -3713,11 +3713,27 @@ void ggml_compute_forward_norm(

 // ggml_compute_forward_group_rms_norm

+// fusion kinds that can be combined with the rms_norm computation in a single pass.
+// extend this enum when adding new fused variants (e.g. FUSE_ADD, FUSE_MUL_ADD, ...).
+enum ggml_rms_norm_fuse_op {
+    GGML_RMS_NORM_FUSE_OP_NONE,
+    GGML_RMS_NORM_FUSE_OP_MUL,
+};
+
+template <ggml_rms_norm_fuse_op FUSE_OP>
 static void ggml_compute_forward_rms_norm_f32(
        const ggml_compute_params * params,
-        ggml_tensor * dst) {
+        ggml_tensor * dst_rms_norm,
+        ggml_tensor * dst_fused = nullptr) {

-    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src0 = dst_rms_norm->src[0];
+    const ggml_tensor * src1 = nullptr;
+    ggml_tensor       * dst  = dst_rms_norm;
+
+    if constexpr (FUSE_OP == GGML_RMS_NORM_FUSE_OP_MUL) {
+        src1 = (dst_fused->src[0] == dst_rms_norm) ? dst_fused->src[1] : dst_fused->src[0];
+        dst  = dst_fused;
+    }

    GGML_ASSERT(ggml_are_same_shape(src0, dst));

@@ -3726,11 +3742,10 @@ static void ggml_compute_forward_rms_norm_f32(
    const int ith = params->ith;
    const int nth = params->nth;

-    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS

    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
+    memcpy(&eps, dst_rms_norm->op_params, sizeof(float));
    GGML_ASSERT(eps >= 0.0f);

    // TODO: optimize
@@ -3740,25 +3755,32 @@ static void ggml_compute_forward_rms_norm_f32(
                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

                ggml_float sum = 0.0;
+                // worth switching to explicit SIMD?
                for (int64_t i00 = 0; i00 < ne00; i00++) {
                    sum += (ggml_float)(x[i00] * x[i00]);
                }

-                const float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                memcpy(y, x, ne00 * sizeof(float));
-                // for (int i00 = 0; i00 < ne00; i00++) {
-                //     y[i00] = x[i00];
-                // }
-
+                const float mean  = sum/ne00;
                const float scale = 1.0f/sqrtf(mean + eps);

                // if you hit this, likely you got an inf somewhere earlier
                assert(scale > 0.0f);

-                ggml_vec_scale_f32(ne00, y, scale);
+                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                if constexpr (FUSE_OP == GGML_RMS_NORM_FUSE_OP_MUL) {
+                    const int64_t i11 = i01 % ne11;
+                    const int64_t i12 = i02 % ne12;
+                    const int64_t i13 = i03 % ne13;
+                    const float * w = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        y[i00] = x[i00] * scale * w[i00];
+                    }
+                } else {
+                    memcpy(y, x, ne00 * sizeof(float));
+                    ggml_vec_scale_f32(ne00, y, scale);
+                }
            }
        }
    }
@@ -3773,7 +3795,31 @@ void ggml_compute_forward_rms_norm(
    switch (src0->type) {
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rms_norm_f32(params, dst);
+                ggml_compute_forward_rms_norm_f32<GGML_RMS_NORM_FUSE_OP_NONE>(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// Fused RMS_NORM + MUL: computes dst = rms_norm(src0) * src1 in a single pass.
+// This avoids materializing the intermediate rms_norm result in memory.
+void ggml_compute_forward_rms_norm_mul_fused(
+        const ggml_compute_params * params,
+        ggml_tensor * dst_rms_norm,
+        ggml_tensor * dst_mul) {
+
+    GGML_ASSERT(dst_mul != nullptr);
+    GGML_ASSERT(dst_mul->src[0] == dst_rms_norm || dst_mul->src[1] == dst_rms_norm);
+
+    const ggml_tensor * src0 = dst_rms_norm->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rms_norm_f32<GGML_RMS_NORM_FUSE_OP_MUL>(params, dst_rms_norm, dst_mul);
            } break;
        default:
            {
@@ -11212,3 +11258,91 @@ void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_
            }
    }
 }
+
+static void ggml_compute_forward_fwht_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    // Fast Walsh-Hadamard transform of every row of src1 into dst, scaled by 1/sqrt(n). Only src0's shape/strides are read, never its data.
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t n = ne10;
+    GGML_ASSERT((n & (n - 1)) == 0); // must be power of 2
+    GGML_ASSERT(ne0 == n && nb0 == sizeof(float) && nb10 == sizeof(float)); // rows are read and written as dense float[n]
+    const int64_t nr = ne11 * ne12 * ne13; // total number of rows
+    const int64_t rows_per_thread = (nr + nth - 1) / nth;
+    const int64_t start_row = ith * rows_per_thread; // each thread owns one contiguous slice of rows
+    const int64_t end_row = MIN(start_row + rows_per_thread, nr);
+
+    const float scale = 1.0f / sqrtf((float)n);
+
+#if defined(GGML_SIMD)
+    const GGML_F32_VEC v_minus_one = GGML_F32_VEC_SET1(-1.0f);
+#endif
+
+    for (int64_t r = start_row; r < end_row; r++) {
+        const int64_t i13 = r / (ne11 * ne12); // unflatten r into (i11, i12, i13)
+        const int64_t i12 = (r - i13 * ne11 * ne12) / ne11;
+        const int64_t i11 = r - i13 * ne11 * ne12 - i12 * ne11;
+
+        const float * src_row = (const float *) ((const char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13);
+        float * dst_row = (float *) ((char *) dst->data + i11 * nb1 + i12 * nb2 + i13 * nb3);
+        // copy the row with the normalisation applied up front; the butterflies below then run in place on dst_row
+        for (int64_t j = 0; j < n; j++) {
+            dst_row[j] = src_row[j] * scale;
+        }
+
+        // Scalar passes: butterfly spans shorter than one SIMD vector (all spans when SIMD is unavailable)
+#if defined(GGML_SIMD)
+        const int step = GGML_F32_EPR;
+#else
+        const int step = n;
+#endif
+        for (int64_t len = 1; len < step && len < n; len <<= 1) {
+            for (int64_t i = 0; i < n; i += 2 * len) {
+                for (int64_t j = 0; j < len; j++) {
+                    float u = dst_row[i + j];
+                    float v = dst_row[i + len + j];
+                    dst_row[i + j] = u + v;
+                    dst_row[i + len + j] = u - v;
+                }
+            }
+        }
+
+        // SIMD passes using GGML_F32_VEC_* macros for multi-architecture support
+#if defined(GGML_SIMD)
+        for (int64_t len = step; len < n; len <<= 1) {
+            for (int64_t i = 0; i < n; i += 2 * len) {
+                for (int64_t j = 0; j < len; j += step) {
+                    GGML_F32_VEC u = GGML_F32_VEC_LOAD(dst_row + i + j);
+                    GGML_F32_VEC v = GGML_F32_VEC_LOAD(dst_row + i + len + j);
+                    // presumably FMA(u, v, -1) evaluates u + v * (-1), i.e. u - v as in the scalar butterfly - confirm per-arch macro
+                    GGML_F32_VEC_STORE(dst_row + i + j,       GGML_F32_VEC_ADD(u, v));
+                    GGML_F32_VEC_STORE(dst_row + i + len + j, GGML_F32_VEC_FMA(u, v, v_minus_one));
+                }
+            }
+        }
+#endif
+    }
+}
+
+void ggml_compute_forward_fwht(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src1 = dst->src[1]; // the transform input; dispatch is on its element type
+
+    switch (src1->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_fwht_f32(params, dst);
+            }
+            break;
+        default: // any other input type aborts
+            {
+                GGML_ABORT("fatal error - fwht is F32 only");
+            }
+    }
+}
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -44,6 +44,7 @@ void ggml_compute_forward_concat(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rms_norm_mul_fused(const struct ggml_compute_params * params, struct ggml_tensor * dst_rms_norm, struct ggml_tensor * dst_mul);
 void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -111,6 +112,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_fwht(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 #ifdef __cplusplus
 }
--- a/ggml/src/ggml-cpu/spacemit/ime.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime.h
+++ b/ggml/src/ggml-cpu/spacemit/ime.h
@@ -8,6 +8,14 @@ extern "C" {

 ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);

+void ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(int thread_n);
+
+void ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(int thread_n);
+
+void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment);
+
+void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp
--- a/ggml/src/ggml-cpu/spacemit/ime_env.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime_env.cpp
@@ -0,0 +1,320 @@
+#include "ime_env.h"
+
+#include "ggml-impl.h"
+#include "spine_mem_pool.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <fstream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+
+namespace ggml::cpu::riscv64_spacemit {
+bool spine_core_info::get_spine_core_info(std::vector<spine_core_info> & result) {
+    // Known marchid values (as printed in /proc/cpuinfo) and the core architecture each one maps to.
+    static const std::unordered_map<uint64_t, spine_core_arch_id> spine_march_mapping_ = {
+        { 0x8000000058000001, spine_core_arch_id::core_arch_x60 },
+        { 0x8000000041000001, spine_core_arch_id::core_arch_a60 },
+        { 0x8000000058000002, spine_core_arch_id::core_arch_x100},
+        { 0x8000000041000002, spine_core_arch_id::core_arch_a100},
+    };
+
+    result.clear();
+    std::ifstream file("/proc/cpuinfo");
+    if (!file.is_open()) {
+        return false;
+    }
+
+    std::vector<std::array<uint64_t, 2>> cpu_info_list;  // complete {processor index, marchid} records, in file order
+    uint64_t current_processor = spine_invalid_core_id;
+    uint64_t current_marchid   = 0;
+    bool     has_processor     = false;
+    bool     has_marchid       = false;
+
+    // Commit the record being assembled, but only if both of its fields were seen.
+    const auto flush_record = [&]() {
+        if (has_processor && has_marchid) {
+            cpu_info_list.push_back({ current_processor, current_marchid });
+        }
+    };
+
+    std::string line;
+    while (std::getline(file, line)) {
+        if (line.compare(0, 9, "processor") == 0) {
+            flush_record();  // a "processor" line opens a new record, so the previous one is finished
+            has_processor = has_marchid = false;  // never carry a stale index or marchid into the new record
+
+            const size_t colon_pos = line.find(':');
+            if (colon_pos != std::string::npos) {
+                current_processor = std::stoi(line.substr(colon_pos + 1));  // throws if the value is not numeric
+                has_processor     = true;
+            }
+        } else if (line.compare(0, 7, "marchid") == 0) {
+            const size_t colon_pos = line.find(':');
+            if (colon_pos != std::string::npos) {
+                std::string marchid_str = line.substr(colon_pos + 1);
+                // <cctype> classifiers require an unsigned char value; passing a raw (possibly negative) char is undefined.
+                marchid_str.erase(std::remove_if(marchid_str.begin(), marchid_str.end(),
+                                  [](unsigned char ch) { return std::isspace(ch) != 0; }), marchid_str.end());
+                current_marchid = std::stoull(marchid_str, nullptr, 16);  // throws if the value is not hexadecimal
+                has_marchid     = true;
+            }
+        }
+    }
+    flush_record();  // the last record has no following "processor" line to trigger the commit
+
+    // Emit every complete record with a known marchid; a trailing incomplete record no longer discards the valid ones.
+    for (const auto & cpu_info : cpu_info_list) {
+        const auto arch_it = spine_march_mapping_.find(cpu_info[1]);
+        if (cpu_info[0] != spine_invalid_core_id && arch_it != spine_march_mapping_.end()) {
+            auto core_info    = spine_core_info();
+            core_info.core_id = cpu_info[0];
+            core_info.arch_id = arch_it->second;
+            result.push_back(core_info);
+        }
+    }
+
+    return !cpu_info_list.empty();  // true when at least one complete record was parsed
+}
+
+namespace {
+uint16_t hex_string_to_u16(const std::string & hex_str) {  // parse a hexadecimal string (optional 0x/0X prefix) into a uint16_t
+    try {
+        size_t pos = 0;
+        if (hex_str.substr(0, 2) == "0x" || hex_str.substr(0, 2) == "0X") {
+            pos = 2;
+        }
+        unsigned long result = std::stoul(hex_str.substr(pos), nullptr, 16);  // parsing stops at the first non-hex character; the rest is ignored
+        if (result > std::numeric_limits<uint16_t>::max()) {
+            throw std::out_of_range("Converted value is out of range for uint16_t");
+        }
+        return static_cast<uint16_t>(result);
+    } catch (const std::invalid_argument & e) {  // no hex digits at all: rethrown with a fixed message
+        throw std::invalid_argument("Invalid hexadecimal string");
+    } catch (const std::out_of_range & e) {  // from std::stoul or from the range check above; passed through unchanged
+        throw;
+    }
+}
+// Map a memory-pool backend to the upper-case token that parse_mem_backend accepts (case-insensitively).
+const char * spine_mem_pool_backend_to_string(spine_mem_pool_backend backend) {
+    switch (backend) {
+        case spine_mem_pool_backend::none:
+            return "NONE";
+        case spine_mem_pool_backend::posix_memalign:
+            return "POSIX";
+        case spine_mem_pool_backend::transparent_hugepage:
+            return "HPAGE";
+        case spine_mem_pool_backend::hugetlb_1g:
+            return "HPAGE1GB";
+    }
+    // reached only for a value that is not one of the enumerators listed above
+    return "unknown";
+}
+// Parse a backend name case-insensitively; throws std::runtime_error for an unrecognised name.
+spine_mem_pool_backend parse_mem_backend(const char * mem_backend_str) {
+    if (mem_backend_str == nullptr || mem_backend_str[0] == '\0') {
+        return spine_mem_pool_backend::transparent_hugepage;  // default when the setting is missing or empty
+    }
+
+    std::string value(mem_backend_str);
+    std::transform(value.begin(), value.end(), value.begin(),
+                   [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
+
+    if (value == "none") {
+        return spine_mem_pool_backend::none;
+    }
+
+    if (value == "posix") {
+        return spine_mem_pool_backend::posix_memalign;
+    }
+
+    if (value == "hpage") {
+        return spine_mem_pool_backend::transparent_hugepage;
+    }
+
+    if (value == "hpage1gb") {
+        return spine_mem_pool_backend::hugetlb_1g;
+    }
+    // the message echoes the lower-cased input
+    throw std::runtime_error("invalid SPACEMIT_MEM_BACKEND: " + value + ", expected NONE, POSIX, HPAGE or HPAGE1GB");
+}
+}  // namespace
+
+// Builds the SpacemiT CPU configuration: discovers the cores, applies the SPACEMIT_* environment overrides
+// (preferred arch / core ids, memory backend, TCM) and allocates the init barriers; throws or aborts on bad input.
+spine_env_info::spine_env_info() {
+    num_cores = static_cast<int>(std::thread::hardware_concurrency());
+    spine_core_info::get_spine_core_info(core_info_list);
+
+    // special for x60 K1: when 8 cores are listed and the first is an x60, relabel entries 0..3 as a60
+    if (core_info_list.size() == 8 && core_info_list[0].arch_id == spine_core_arch_id::core_arch_x60) {
+        for (int i = 0; i < 4; i++) {
+            core_info_list[i].arch_id = spine_core_arch_id::core_arch_a60;
+        }
+    }
+
+    // special for qemu: nothing was detected, so create num_cores entries that all use SPACEMIT_CORE_ARCH
+    if (core_info_list.size() == 0) {
+        char * spine_core_arch_str = getenv("SPACEMIT_CORE_ARCH");
+        if (spine_core_arch_str != nullptr) {
+            auto arch_id = hex_string_to_u16(spine_core_arch_str);
+            for (int i = 0; i < num_cores; i++) {
+                auto core_info    = spine_core_info();
+                core_info.core_id = i;
+                core_info.arch_id = spine_core_arch_id{ arch_id };
+                core_info_list.push_back(core_info);
+            }
+        }
+    }
+
+    if (core_info_list.size() == 0) {
+        throw std::runtime_error(
+            "Failed to get SPACEMIT_CORE_ARCH from environment or failed to parse it from /proc/cpuinfo");
+    }
+    // An env value counts as set only when non-empty: test its first character (pointer != "" compares addresses)
+    char * spine_perfer_core_arch_str = getenv("SPACEMIT_PERFER_CORE_ARCH");
+    if (spine_perfer_core_arch_str != nullptr && spine_perfer_core_arch_str[0] != '\0') {
+        perfer_core_arch_id = spine_core_arch_id{ hex_string_to_u16(spine_perfer_core_arch_str) };
+    }
+    // SPACEMIT_PERFER_CORE_ID is a comma-separated list of decimal core ids; std::stoi throws on a bad token
+    char *           spine_perfer_core_id_str = getenv("SPACEMIT_PERFER_CORE_ID");
+    std::vector<int> perfer_core_id_vec;
+    if (spine_perfer_core_id_str != nullptr && spine_perfer_core_id_str[0] != '\0') {
+        std::string perfer_core_id_str(spine_perfer_core_id_str);
+        size_t      start = 0;
+        size_t      end   = 0;
+        while ((end = perfer_core_id_str.find(',', start)) != std::string::npos) {
+            std::string core_id_substr = perfer_core_id_str.substr(start, end - start);
+            perfer_core_id_vec.push_back(std::stoi(core_id_substr));
+            start = end + 1;
+        }
+        std::string core_id_substr = perfer_core_id_str.substr(start);
+        perfer_core_id_vec.push_back(std::stoi(core_id_substr));
+    }
+    // No arch requested: prefer every core whose arch id has top nibble 0xA; otherwise every core of that arch
+    perfer_core_ids.reserve(num_cores);
+    if (perfer_core_arch_id == spine_core_arch_id::core_arch_none) {
+        for (auto & core_info : core_info_list) {
+            auto core_arch_id   = core_info.arch_id;
+            auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
+            if (core_arch_head == 0xA) {
+                num_perfer_cores++;
+                perfer_core_arch_id = core_arch_id;
+                cpu_mask |= (1ULL << core_info.core_id);
+                perfer_core_ids.push_back(core_info.core_id);
+            }
+        }
+    } else {
+        for (auto & core_info : core_info_list) {
+            auto core_arch_id = core_info.arch_id;
+            if (core_arch_id == perfer_core_arch_id) {
+                num_perfer_cores++;
+                cpu_mask |= (1ULL << core_info.core_id);
+                auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
+                if (core_arch_head == 0xA) {
+                    perfer_core_ids.push_back(core_info.core_id);
+                }
+            }
+        }
+        if (num_perfer_cores == 0) {
+            GGML_ABORT("can not find core with arch id %x for SPACEMIT_PERFER_CORE_ARCH in core info list\n",
+                       (uint16_t) perfer_core_arch_id);
+        }
+    }
+    // An explicit SPACEMIT_PERFER_CORE_ID list replaces the selection made above
+    if (perfer_core_id_vec.size() > 0) {
+        perfer_core_ids.clear();
+        cpu_mask         = 0;
+        num_perfer_cores = 0;
+        for (int core_id : perfer_core_id_vec) {
+            if (core_id < 0 || core_id >= num_cores || static_cast<size_t>(core_id) >= core_info_list.size()) {
+                GGML_ABORT("invalid core id in SPACEMIT_PERFER_CORE_ID: %d, should be between 0 and %d\n", core_id,
+                           num_cores - 1);
+            }
+            auto core_info    = core_info_list[core_id];  // looked up by position; the guard above keeps it in range
+            auto core_arch_id = core_info.arch_id;
+            if (core_arch_id == perfer_core_arch_id) {
+                cpu_mask |= (1ULL << core_id);
+                perfer_core_ids.push_back(core_id);
+            } else {
+                GGML_ABORT(
+                    "core id %d in SPACEMIT_PERFER_CORE_ID has arch id %x which does not match "
+                    "SPACEMIT_PERFER_CORE_ARCH %x\n",
+                    core_id, (uint16_t) core_arch_id, (uint16_t) perfer_core_arch_id);
+            }
+        }
+        std::string perfer_core_id_vec_str;
+        for (int core_id : perfer_core_id_vec) {
+            perfer_core_id_vec_str += std::to_string(core_id) + ",";
+        }
+        perfer_core_id_vec_str.pop_back();
+        GGML_LOG_DEBUG("SPACEMIT_PERFER_CORE_ID is set, perferred core ids: %s\n", perfer_core_id_vec_str.c_str());
+        num_perfer_cores = static_cast<int>(perfer_core_id_vec.size());
+    }
+
+    use_ime1 = perfer_core_arch_id == spine_core_arch_id::core_arch_a60 ||
+               perfer_core_arch_id == spine_core_arch_id::core_arch_x100;
+    use_ime2 = perfer_core_arch_id == spine_core_arch_id::core_arch_a100;
+
+    mem_backend                  = parse_mem_backend(getenv("SPACEMIT_MEM_BACKEND"));
+    char * spine_disable_tcm_str = getenv("SPACEMIT_DISABLE_TCM");
+    auto   user_disable_tcm      = spine_disable_tcm_str != nullptr && strcmp(spine_disable_tcm_str, "0") != 0;
+    // TCM is probed unless SPACEMIT_DISABLE_TCM is set to anything other than "0"
+    if (!user_disable_tcm) {
+        spine_mem_pool_tcm_info tcm_info;
+        if (spine_mem_pool_tcm_init(&tcm_info)) {
+            use_tcm      = tcm_info.available;
+            tcm_blk_size = tcm_info.blk_size;
+            GGML_LOG_DEBUG("CPU_RISCV64_SPACEMIT: tcm is available, blk_size: %zu, blk_num: %zu, is_fake_tcm: %d\n",
+                           tcm_info.blk_size, tcm_info.blk_num, tcm_info.is_fake_tcm);
+
+            for (auto & core_info : core_info_list) {  // count the entries listed before the first 0xA-family core
+                auto core_arch_head = (uint16_t) (core_info.arch_id) >> 12;
+                if (core_arch_head != 0xA) {
+                    aicpu_id_offset++;
+                } else {
+                    break;
+                }
+            }
+        }
+    }
+
+    GGML_LOG_DEBUG(
+        "CPU_RISCV64_SPACEMIT: num_cores: %d, num_perfer_cores: %d, perfer_core_arch_id: %x, exclude_main_thread: %d, "
+        "use_ime1: %d, use_ime2: %d, mem_backend: %s, cpu_mask: %lx, aicpu_id_offset: %d\n",
+        num_cores, num_perfer_cores, (uint16_t) perfer_core_arch_id, exclude_main_thread, use_ime1, use_ime2,
+        spine_mem_pool_backend_to_string(mem_backend), cpu_mask, aicpu_id_offset);
+    // Prefer shared memory for the init barriers; fall back to the heap when that allocation fails
+    const size_t init_barrier_size = sizeof(spine_barrier_t) * spine_init_barrier_count;
+    init_barrier =
+        static_cast<spine_barrier_t *>(spine_mem_pool_shared_mem_alloc(init_barrier_size, alignof(spine_barrier_t)));
+    if (init_barrier != nullptr) {
+        init_barrier_is_shared_mem = true;
+    } else {
+        GGML_LOG_WARN(
+            "CPU_RISCV64_SPACEMIT: failed to allocate init_barrier from shared mem, falling back to heap\n");
+        init_barrier = new spine_barrier_t[spine_init_barrier_count];
+    }
+
+    spine_barrier_init(init_barrier, spine_init_barrier_count, 2);
+}
+
+// Releases init_barrier through the deallocator matching its allocation, then clears both ownership fields.
+spine_env_info::~spine_env_info() {
+    if (!init_barrier_is_shared_mem) {
+        delete[] init_barrier;  // heap fallback taken in the constructor
+    } else {
+        spine_mem_pool_shared_mem_free(init_barrier);
+    }
+    init_barrier_is_shared_mem = false;
+    init_barrier               = nullptr;
+}
+
+spine_env_info global_spine_env_info;  // namespace-scope instance: its constructor runs during static initialization
+
+}  // namespace ggml::cpu::riscv64_spacemit
--- a/ggml/src/ggml-cpu/spacemit/ime_env.h
+++ b/ggml/src/ggml-cpu/spacemit/ime_env.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include "spine_barrier.h"
+#include "spine_mem_pool.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace ggml::cpu::riscv64_spacemit {
+
+constexpr uint64_t spine_invalid_core_id    = 0xFFFFFFFF;  // default of spine_core_info::core_id ("not assigned")
+constexpr size_t   spine_init_barrier_count = 16;          // element count of spine_env_info::init_barrier
+
+enum class spine_core_arch_id : uint16_t {  // top nibble: 0x5 for x* cores, 0xA for a* cores; low 12 bits: model number
+    core_arch_none = 0,       // default; spine_env_info's constructor reads it as "no preferred arch requested"
+    core_arch_x60  = 0x503C,  // 0x03C == 60
+    core_arch_x100 = 0x5064,  // 0x064 == 100
+    core_arch_x200 = 0x50C8,  // 0x0C8 == 200
+    core_arch_a60  = 0xA03C,  // 0x03C == 60
+    core_arch_a100 = 0xA064,  // 0x064 == 100
+    core_arch_a200 = 0xA0C8,  // 0x0C8 == 200
+};
+
+struct spine_core_info {  // one CPU core: its id plus its arch id
+    uint64_t           core_id{ spine_invalid_core_id };  // stays at the sentinel until a real id is assigned
+    spine_core_arch_id arch_id{ spine_core_arch_id::core_arch_none };
+
+    static bool get_spine_core_info(std::vector<spine_core_info> & result);  // NOTE(review): presumably appends the detected cores to `result`; return value meaning not visible here
+};
+
+struct spine_env_info {  // runtime configuration assembled once by the constructor from core detection and SPACEMIT_* env vars
+    std::vector<spine_core_info> core_info_list;   // detected cores, or entries synthesized from SPACEMIT_CORE_ARCH
+    std::vector<int>             perfer_core_ids;  // ids of the cores selected as preferred
+    int                          aicpu_id_offset{ 0 };  // entries preceding the first 0xA-family core; only counted when TCM init succeeds
+    int                          num_cores{ 0 };         // std::thread::hardware_concurrency()
+    int                          num_perfer_cores{ 0 };  // number of preferred cores
+    spine_core_arch_id           perfer_core_arch_id{ spine_core_arch_id::core_arch_none };  // from SPACEMIT_PERFER_CORE_ARCH, else taken from the 0xA-family cores found
+    bool                         exclude_main_thread{ false };  // not assigned by the constructor, only logged there
+    bool                         use_ime2{ false };  // preferred arch is a100
+    bool                         use_ime1{ false };  // preferred arch is a60 or x100
+    bool                         use_tcm{ false };   // copied from tcm_info.available when TCM init succeeds
+    spine_mem_pool_backend       mem_backend{ spine_mem_pool_backend::transparent_hugepage };  // parsed from SPACEMIT_MEM_BACKEND
+    uint64_t                     tcm_blk_size{ 0 };  // copied from tcm_info.blk_size when TCM init succeeds
+    uint64_t                     cpu_mask{ 0 };      // bit i set when core id i is preferred
+    spine_barrier_t *            init_barrier{ nullptr };  // array of spine_init_barrier_count barriers, owned by this object
+    bool                         init_barrier_is_shared_mem{ false };  // selects which deallocator the destructor uses
+
+    spine_env_info();   // throws std::runtime_error or aborts when the environment cannot be resolved
+    ~spine_env_info();  // frees init_barrier; NOTE(review): copy operations are implicitly generated, so a copy would free it twice
+};
+
+extern spine_env_info global_spine_env_info;  // declaration of the shared instance; the definition lives in the implementation file
+
+}  // namespace ggml::cpu::riscv64_spacemit
--- a/ggml/src/ggml-cpu/spacemit/ime_kernels.h
+++ b/ggml/src/ggml-cpu/spacemit/ime_kernels.h
@@ -1,26 +1,189 @@
 #pragma once

+#include <cassert>
 #include <cstddef>
+#include <functional>
+
+namespace spacemit_kernels {
+
+#define BLOCK_QNK_LEN 256  // weights per row in the q2_k / q3_k blocks below; being a macro it is not scoped to the namespace
+
+template <int N> struct nrow_block_q2_k {  // N rows of one BLOCK_QNK_LEN-weight block at 2 bits per weight
+    // [4bit scale + 4bit zp] * N * 16
+    uint8_t  scales[N * BLOCK_QNK_LEN / 16];  // one byte per 16-weight group, 16 bytes per row
+    // [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
+    // [b64, b80, b96, b112] ...[b79, b95, b111, b127]
+    // [b128, b144, b160, b176] ...[b143, b159, b175, b191]
+    // [b192, b208, b224, b240] ...[b207, b223, b239, b255]
+    uint8_t  qs[N * BLOCK_QNK_LEN / 4];  // four 2-bit values per byte, 64 bytes per row
+    uint16_t scales16[N];  // one 16-bit value per row; NOTE(review): presumably an fp16 block scale - confirm
+    uint16_t zeros16[N];   // one 16-bit value per row; NOTE(review): presumably an fp16 block minimum - confirm
+};  // NOTE(review): uint8_t/uint16_t come from <cstdint>, which this header does not include directly
+
+template <int N> struct nrow_block_q3_k {  // N rows of one BLOCK_QNK_LEN-weight block: 2 low bits in qs plus 1 bit in hmask
+    // [8bit scale] * N * 16
+    int8_t   scales[N * 16];  // one signed byte per 16-weight group
+    // [b0, b1, b2, b3, b4, b5, b6, b7] ... [b248, b249, b250, b251, b252, b253, b254, b255]
+    uint8_t  hmask[N * BLOCK_QNK_LEN / 8];  // one bit per weight, 32 bytes per row
+    // [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
+    // [b64, b80, b96, b112] ...[b79, b95, b111, b127]
+    // [b128, b144, b160, b176] ...[b143, b159, b175, b191]
+    // [b192, b208, b224, b240] ...[b207, b223, b239, b255]
+    uint8_t  qs[N * BLOCK_QNK_LEN / 4];  // four 2-bit values per byte, 64 bytes per row
+    uint16_t scales16[N];  // one 16-bit value per row; NOTE(review): presumably an fp16 block scale - confirm
+};
+
+template <int N> struct nrow_block_mxfp4 {  // N-row MXFP4 block, 21 bytes per row
+    uint8_t e[N];        // one byte per row; NOTE(review): presumably the shared block exponent - confirm
+    uint8_t qh[4 * N];   // 4 bytes per row; NOTE(review): role not visible in this header - confirm against the kernel
+    uint8_t qs[16 * N];  // 16 bytes per row; NOTE(review): presumably two 4-bit values per byte - confirm
+};
+
+template <int N> struct __attribute__((packed)) nrow_block_q5_1 {  // packed N-row block: 23 bytes per row, no padding
+    uint16_t scales16[N];  // one 16-bit value per row; NOTE(review): presumably an fp16 scale - confirm
+    uint8_t  zp[N];        // one zero-point byte per row
+    // n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
+    uint8_t  qh[4 * N];  // 4 bytes per row holding the high bits listed above
+    // n0 [b0, b1], [b2, b3] ....  [b30, b31]
+    // n1 [b0, b1], [b2, b3] ....  [b30, b31]
+    uint8_t  qs[16 * N];  // 16 bytes per row, two values per byte as listed above
+};
+
+static_assert(sizeof(nrow_block_q5_1<1>) == sizeof(uint8_t) + 22, "wrong nrow_block_q5_1 block size/padding");  // 2 + 1 + 4 + 16 bytes
+
+template <int N> struct __attribute__((packed)) nrow_block_q5_0 {  // packed N-row block: 22 bytes per row, no zero point
+    uint16_t scales16[N];  // one 16-bit value per row; NOTE(review): presumably an fp16 scale - confirm
+    // n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
+    uint8_t  qh[4 * N];  // 4 bytes per row holding the high bits listed above
+    // n0 [b0, b1], [b2, b3] ....  [b30, b31]
+    // n1 [b0, b1], [b2, b3] ....  [b30, b31]
+    uint8_t  qs[16 * N];  // 16 bytes per row, two values per byte as listed above
+};
+
+static_assert(sizeof(nrow_block_q5_0<1>) == 22, "wrong nrow_block_q5_0 block size/padding");  // 2 + 4 + 16 bytes
+
+using gemm_kernel_quantize_def = std::function<
+    size_t(size_t, const uint8_t *, const uint8_t *, const uint8_t *, float *, size_t, size_t, size_t, size_t)>;  // same parameter list as the gemm_kernel_* declarations in this header
+
+using moe_gemm_kernel_quantize_def = std::function<
+    size_t(size_t, const uint8_t **, const uint8_t *, const uint8_t *, float **, size_t, size_t, size_t, size_t)>;  // same parameter list as the moe_m2_gemm_kernel_* declarations

-namespace sqnbitgemm_spacemit_ime {
 namespace ime1 {
-size_t gemm_kernel_i8i4(size_t            blk_len,
-                        const std::byte * quant_a_ptr,
-                        const std::byte * quant_b_data,
-                        const float *     quant_b_scale,
-                        const std::byte * quant_b_zp,
-                        float *           c_ptr,
-                        size_t            count_m,
-                        size_t            count_n,
-                        size_t            count_k,
-                        size_t            block_count_k,
-                        size_t            ldc,
-                        const float *     bias,
-                        const size_t      scale_stride);
+size_t gemm_kernel_i8i4(size_t          blk_len,  // NOTE(review): presumably int8 activations x 4-bit weights - confirm against the kernel
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,  // float output
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);  // parameter list matches gemm_kernel_quantize_def; meaning of the size_t result is not visible here

-void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);  // float input, byte output; NOTE(review): presumably quantizes one row - confirm

-void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);  // float input, byte output; NOTE(review): presumably quantizes four rows per call - confirm

 }  // namespace ime1
-}  // namespace sqnbitgemm_spacemit_ime
+
+namespace ime2 {  // gemm_kernel_* share the gemm_kernel_quantize_def parameter list, moe_m2_* the moe_gemm_kernel_quantize_def one
+size_t gemm_kernel_i8i2k(size_t          blk_len,  // NOTE(review): presumably int8 activations x 2-bit K-quant weights - confirm
+                         const uint8_t * quant_a_ptr,
+                         const uint8_t * quant_b_data,
+                         const uint8_t * quant_b_zp,
+                         float *         c_ptr,
+                         size_t          count_m,
+                         size_t          count_n,
+                         size_t          k_blks,
+                         size_t          ldc);
+
+size_t gemm_kernel_i8i3k(size_t          blk_len,  // NOTE(review): presumably int8 activations x 3-bit K-quant weights - confirm
+                         const uint8_t * quant_a_ptr,
+                         const uint8_t * quant_b_data,
+                         const uint8_t * quant_b_zp,
+                         float *         c_ptr,
+                         size_t          count_m,
+                         size_t          count_n,
+                         size_t          k_blks,
+                         size_t          ldc);
+
+size_t gemm_kernel_i8i4(size_t          blk_len,  // NOTE(review): presumably int8 activations x 4-bit weights - confirm
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t gemm_kernel_i8i4_hp(size_t          blk_len,  // NOTE(review): meaning of the _hp suffix is not visible in this header
+                           const uint8_t * quant_a_ptr,
+                           const uint8_t * quant_b_data,
+                           const uint8_t * quant_b_zp,
+                           float *         c_ptr,
+                           size_t          count_m,
+                           size_t          count_n,
+                           size_t          k_blks,
+                           size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8i4(size_t           blk_len,  // MoE variant: quant_a_ptr and c_ptr are arrays of pointers
+                               const uint8_t ** quant_a_ptr,
+                               const uint8_t *  quant_b_data,
+                               const uint8_t *  quant_b_zp,
+                               float **         c_ptr,
+                               size_t           count_m,
+                               size_t           count_n,
+                               size_t           k_blks,
+                               size_t           ldc);
+
+size_t gemm_kernel_i8i8(size_t          blk_len,  // NOTE(review): presumably int8 activations x 8-bit weights - confirm
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t gemm_kernel_i8mxfp4(size_t          blk_len,  // NOTE(review): presumably int8 activations x MXFP4 weights - confirm
+                           const uint8_t * quant_a_ptr,
+                           const uint8_t * quant_b_data,
+                           const uint8_t * quant_b_zp,
+                           float *         c_ptr,
+                           size_t          count_m,
+                           size_t          count_n,
+                           size_t          k_blks,
+                           size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8mxfp4(size_t           blk_len,  // MoE variant: quant_a_ptr and c_ptr are arrays of pointers
+                                  const uint8_t ** quant_a_ptr,
+                                  const uint8_t *  quant_b_data,
+                                  const uint8_t *  quant_b_zp,
+                                  float **         c_ptr,
+                                  size_t           count_m,
+                                  size_t           count_n,
+                                  size_t           k_blks,
+                                  size_t           ldc);
+
+size_t gemm_kernel_i8i5(size_t          blk_len,  // NOTE(review): presumably int8 activations x 5-bit weights - confirm
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8i5(size_t           blk_len,  // MoE variant: quant_a_ptr and c_ptr are arrays of pointers
+                               const uint8_t ** quant_a_ptr,
+                               const uint8_t *  quant_b_data,
+                               const uint8_t *  quant_b_zp,
+                               float **         c_ptr,
+                               size_t           count_m,
+                               size_t           count_n,
+                               size_t           k_blks,
+                               size_t           ldc);
+}  // namespace ime2
+}  // namespace spacemit_kernels
--- a/ggml/src/ggml-cpu/spacemit/repack.cpp
+++ b/ggml/src/ggml-cpu/spacemit/repack.cpp
--- a/Show More
+++ b/Show More