cont : remove /api/tags

server : remove /api endpoints
2026-05-14 17:07:43 +03:00 · 2026-04-20 15:45:42 +03:00 · 2026-04-20 15:34:18 +03:00
830 changed files with 45998 additions and 86104 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
+ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04

 ## Build Image

@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG IGC_VERSION=v2.32.7
-ARG IGC_VERSION_FULL=2_2.32.7+21184
-ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
+ARG IGC_VERSION=v2.30.1
+ARG IGC_VERSION_FULL=2_2.30.1+20950
+ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
 ARG IGDGMM_VERSION=22.9.0
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -103,7 +103,6 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
-    spirv-headers
  ];
 in

@@ -147,6 +146,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      ninja
      pkg-config
      git
+      spirv-headers
    ]
    ++ optionals useCuda [
      cudaPackages.cuda_nvcc
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -2,19 +2,7 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
 ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
 ARG UBUNTU_VERSION=24.04

-# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
-
-# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.32.0
-ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
-ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
-
-# Optional proxy build arguments
+# Optional proxy build arguments - empty by default
 ARG http_proxy=
 ARG https_proxy=

@@ -90,47 +78,13 @@ ARG http_proxy
 ARG https_proxy

 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
+    && apt-get install -y libgomp1 libtbb12 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

-# Install GPU drivers
-ARG IGC_VERSION
-ARG IGC_VERSION_FULL
-ARG COMPUTE_RUNTIME_VERSION
-ARG COMPUTE_RUNTIME_VERSION_FULL
-ARG IGDGMM_VERSION
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/neo/
-
-# Install NPU drivers
-ARG NPU_DRIVER_VERSION
-ARG NPU_DRIVER_FULL
-ARG LIBZE1_VERSION
-RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
-    && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/npu/
-
-RUN cd /tmp \
-    && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
-    && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
-    && rm libze1_${LIBZE1_VERSION}_amd64.deb
-
 COPY --from=build /app/lib/ /app/

 ### Full (all binaries)
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -12,8 +12,6 @@ body:
        after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: commit
    attributes:
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -1,5 +1,5 @@
 name: Bug (model use)
-description: Something goes wrong when running a model (crashes, garbled outputs, etc.).
+description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
 title: "Eval bug: "
 labels: ["bug-unconfirmed", "model evaluation"]
 body:
@@ -12,8 +12,6 @@ body:
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-completion` binary can be used for simple and reproducible model inference.
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -10,8 +10,6 @@ body:
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -8,8 +8,6 @@ body:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: checkboxes
    id: prerequisites
    attributes:
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -8,8 +8,6 @@ body:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: checkboxes
    id: research-stage
    attributes:
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -9,8 +9,6 @@ body:
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: textarea
    id: background-description
    attributes:
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -6,7 +6,7 @@

 <!-- You can provide more details and link related discussions here. Delete this section if not applicable -->

-## Requirements
+# Requirements

 <!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->

--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -1,116 +0,0 @@
-name: CI (snapdragon)
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  android-ndk-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Android
-        id: build_llama_cpp_snapdragon_android
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-android-snapdragon-release -B build
-          cmake --build build
-          cmake --install build --prefix pkg-snapdragon/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Android Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-  test-snapdragon-qdc:
-    name: Test on QDC Android Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon]
-    runs-on: ubuntu-slim
-    strategy:
-      fail-fast: false
-      matrix:
-        device: [SM8750, SM8650, SM8850]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-
-      - name: Download build artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: llama-cpp-android-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.x'
-          cache: pip
-
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y curl unzip
-
-      - name: Install QDC SDK wheel
-        run: |
-          curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
-          unzip qdc_sdk.zip -d qdc_sdk
-          pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
-
-      - name: Check QDC API key
-        id: check_secret
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-        run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
-
-      - name: Run QDC tests (${{ matrix.device }})
-        if: steps.check_secret.outputs.has-qdc-key == 'true'
-        run: |
-          python scripts/snapdragon/qdc/run_qdc_jobs.py \
-              --test       all \
-              --pkg-dir    pkg-snapdragon/llama.cpp \
-              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }}
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-
-      - name: Cleanup
-        if: always()
-        run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -1,24 +1,26 @@
 name: CI (android)

 on:
-  workflow_dispatch:
+  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
-    paths:
-      - '.github/workflows/build-android.yml'
-      - '**/CMakeLists.txt'
-      - '**/.cmake'
-      - '**/*.h'
-      - '**/*.hpp'
-      - '**/*.c'
-      - '**/*.cpp'
+    paths: [
+      '.github/workflows/build-android.yml',
+      '**/CMakeLists.txt',
+      '**/*.cmake', # any CMake module; '**/.cmake' would only match files literally named ".cmake"
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]

  pull_request:
    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-android.yml'
-      - 'examples/llama.android/**'
+    paths: [
+      '.github/workflows/build-android.yml',
+      'examples/llama.android/**'
+    ]

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -65,24 +67,35 @@ jobs:
    defaults:
      run:
        shell: bash
+    strategy:
+      matrix:
+        include: # one entry per Android variant; 'build' is also the suffix of the uploaded artifact name
+          - build: 'arm64-cpu' # 'defines' is substituted verbatim into the cmake configure command of the Build step
+            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
+          - build: 'arm64-snapdragon' # the Build step first copies docs/backend/snapdragon/CMakeUserPresets.json into the working directory for this entry
+            defines: '--preset arm64-android-snapdragon-release'

    steps:
      - name: Clone
+        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          lfs: false

-      - name: Build
-        id: ndk_build
+      - name: Build Llama.CPP for Hexagon Android
+        id: build_llama_cpp_hexagon_android
        run: |
-          cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
+          if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
+            cp docs/backend/snapdragon/CMakeUserPresets.json .
+          fi
+          cmake ${{ matrix.defines }} -B build
          cmake --build build
          cmake --install build --prefix pkg-adb/llama.cpp

-      - name: Upload Android Build Artifact
-        if: ${{ always() && steps.ndk_build.outcome == 'success' }}
+      - name: Upload Llama.CPP Hexagon Android Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
        uses: actions/upload-artifact@v6
        with:
-          name: llama-cpp-android-arm64-cpu
+          name: llama-cpp-android-${{ matrix.build }}
          path: pkg-adb/llama.cpp
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -1,120 +0,0 @@
-name: CI (openvino)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      'ggml/src/ggml-openvino/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-openvino:
-    name: ubuntu-24-openvino-${{ matrix.openvino_device }}
-
-    concurrency:
-      group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
-    strategy:
-      matrix:
-        include:
-          - variant: cpu
-            runner: '"ubuntu-24.04"'
-            openvino_device: "CPU"
-          - variant: gpu
-            runner: '["self-hosted","Linux","Intel","OpenVINO"]'
-            openvino_device: "GPU"
-
-    runs-on: ${{ fromJSON(matrix.runner) }}
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        if: runner.environment == 'github-hosted'
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
-          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
-
-      - name: Use OpenVINO Toolkit Cache
-        if: runner.environment == 'github-hosted'
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          cmake -B build/ReleaseOV -G Ninja \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
-          time cmake --build build/ReleaseOV --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          cd ${{ github.workspace }}
-          if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
-            export GGML_OPENVINO_DEVICE=GPU
-          fi
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -265,10 +265,6 @@ jobs:
  ggml-ci-intel-openvino-gpu-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

-    concurrency:
-      group: openvino-gpu-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
      OPENVINO_VERSION_MAJOR: "2026.0"
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -1,142 +0,0 @@
-name: CI (sycl)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      'ggml/src/ggml-sycl/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
--- a/.github/workflows/build-virtgpu.yml
+++ b/.github/workflows/build-virtgpu.yml
@@ -1,50 +0,0 @@
-name: CI (virtgpu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      'ggml/src/ggml-virtgpu/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ubuntu-24-virtgpu:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VIRTGPU=ON \
-            -DGGML_VIRTGPU_BACKEND=ON
-          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -456,8 +456,7 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-          # test-backend-ops is too slow on llvmpipe, skip it
-          ctest -L main -E test-backend-ops --verbose --timeout 900
+          ctest -L main --verbose --timeout 900

  ubuntu-24-webgpu-wasm:
    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
@@ -556,6 +555,186 @@ jobs:
            -DGGML_MUSA=ON
          time cmake --build build --config Release -j $(nproc)

+  ubuntu-22-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-sycl
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
+          time cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-sycl-fp16:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-22-sycl-fp16
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
+          time cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-openvino:
+      name: ubuntu-24-openvino-${{ matrix.openvino_device }}
+      strategy:
+        matrix:
+          include:
+            - variant: cpu
+              runner: '"ubuntu-24.04"'
+              openvino_device: "CPU"
+            - variant: gpu
+              runner: '["self-hosted","Linux","X64","Intel"]'
+              openvino_device: "GPU"
+
+      runs-on: ${{ fromJSON(matrix.runner) }}
+
+      env:
+        # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+        OPENVINO_VERSION_MAJOR: "2026.0"
+        OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+
+      steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v6
+
+        - name: ccache
+          if: runner.environment == 'github-hosted'
+          uses: ggml-org/ccache-action@v1.2.21
+          with:
+            key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
+            evict-old-files: 1d
+            save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+        - name: Dependencies
+          id: depends
+          run: |
+            sudo apt-get update
+            sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
+            sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+
+        - name: Use OpenVINO Toolkit Cache
+          if: runner.environment == 'github-hosted'
+          uses: actions/cache@v5
+          id: cache-openvino
+          with:
+            path: ./openvino_toolkit
+            key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+        - name: Setup OpenVINO Toolkit
+          if: steps.cache-openvino.outputs.cache-hit != 'true'
+          uses: ./.github/actions/linux-setup-openvino
+          with:
+            path: ./openvino_toolkit
+            version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+            version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+        - name: Install OpenVINO dependencies
+          run: |
+            cd ./openvino_toolkit
+            chmod +x ./install_dependencies/install_openvino_dependencies.sh
+            echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
+
+        - name: Build
+          id: cmake_build
+          run: |
+            source ./openvino_toolkit/setupvars.sh
+            cmake -B build/ReleaseOV -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DGGML_OPENVINO=ON
+            time cmake --build build/ReleaseOV --config Release -j $(nproc)
+
+        - name: Test
+          id: cmake_test
+          # TODO: fix and re-enable the `test-llama-archs` test below
+          run: |
+            cd ${{ github.workspace }}
+            if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
+              export GGML_OPENVINO_DEVICE=GPU
+            fi
+            ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000

  windows-latest:
    runs-on: windows-2025
@@ -764,6 +943,39 @@ jobs:
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release

+  windows-latest-sycl:  # installs oneAPI, then runs examples/sycl/win-build-sycl.bat
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash  # default shell for every `run` step in this job
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe  # offline installer; 1st arg to install-oneapi.bat
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel  # colon-separated component list; 2nd arg to install-oneapi.bat
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: windows-latest-sycl
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}  # evaluates to true only for pushes to master
+
+      - name: Install  # not cached: the installer script runs on every job execution
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat  # NOTE(review): invoked through the job's default bash shell - confirm this is intended

  windows-latest-hip:
    runs-on: windows-2022
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -29,10 +29,10 @@ jobs:
      uses: actions/setup-python@v6
      with:
        python-version: '3.11'
-        pip-install: poetry==2.4.0
    - name: Install dependencies
      run: |
        cd gguf-py
+        python -m pip install poetry==2.3.2
        poetry install

    - name: Build package
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -31,7 +31,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.35
+          pip-install: -r requirements/requirements-all.txt ty==0.0.26
      # - name: Type-check with Pyright
      #   uses: jakebailey/pyright-action@v2
      #   with:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -598,29 +598,15 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
@@ -628,6 +614,10 @@ jobs:
          variant: ccache
          evict-old-files: 1d

+      - name: Install  # runs the oneAPI installer script with the URL and component list defined in `env`
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
      - name: Build
        id: cmake_build
        shell: cmd
@@ -680,82 +670,6 @@ jobs:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-
  ubuntu-22-rocm:
    runs-on: ubuntu-22.04

@@ -1131,7 +1045,6 @@ jobs:
      - ubuntu-cpu
      - ubuntu-vulkan
      - ubuntu-24-openvino
-      - ubuntu-24-sycl
      - android-arm64
      - macOS-cpu
      - ios-xcode-build
@@ -1220,8 +1133,6 @@ jobs:
            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)

            **Android:**
            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@
 /.vscode/
 /nppBackup

+
 # Coverage

 /gcovr-report/
@@ -73,7 +74,6 @@
 !/models/templates

 # Zig
-
 /zig-out/
 /zig-cache/

@@ -93,7 +93,6 @@
 !/examples/sycl/*.sh

 # Server Web UI temporary files
-
 /tools/server/webui/node_modules
 /tools/server/webui/dist
 # we no longer use gz for index.html
@@ -105,16 +104,11 @@
 __pycache__/
 */poetry.lock
 poetry.toml
-poetry.lock
-uv.lock

 # Nix
-
-flake.lock
 /result

 # Test binaries
-
 /tests/test-backend-ops
 /tests/test-double-float
 /tests/test-grad0
@@ -130,7 +124,6 @@ flake.lock
 /tests/test-tokenizer-1-spm

 # Scripts
-
 !/scripts/install-oneapi.bat

 # Generated by scripts
@@ -139,24 +132,16 @@ flake.lock
 /wikitext-2-raw/

 # Test models for lora adapters
-
 /lora-tests

 # Local scripts
-
 /run-vim.sh
 /run-chat.sh
 /run-spec.sh
 /.ccache/

 # IDE
-
 /*.code-workspace
 /.windsurf/
 # emscripten
 a.out.*
-
-# AGENTS
-
-AGENTS.local.md
-.pi/SYSTEM.md
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -1,34 +0,0 @@
-You are a coding agent. Here are some very important rules that you must follow:
-
-General:
- By very precise and concise when writing code, comments, explanations, etc.
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
- Don't try to build or run the code unless you are explicitly asked to do so
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
-
-Coding:
- When in doubt, always refer to the CONTRIBUTING.md file of the project
- When referencing issues or PRs in comments, use the format:
-  - C/C++ code: `// ref: <url>`
-  - Other (CMake, etc.): `# ref: <url>`
-
-Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
- Always create the pull requests in draft mode
-
-Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
- Do not explicitly set the git author in commits - rely on the default git config
-
-Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
--- a/11
+++ b/11
@@ -23,7 +23,6 @@
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/                                @ggml-org/llama-common
-/common/fit.*                           @JohannesGaessler
 /common/jinja/                          @CISC
 /common/ngram-map.*                     @srogmann
 /convert_*.py                           @CISC
@@ -53,30 +52,28 @@
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
 /ggml/include/                          @ggerganov
-/ggml/src/ggml-backend-meta.cpp         @JohannesGaessler
 /ggml/src/ggml-cann/                    @ggml-org/ggml-cann
 /ggml/src/ggml-common.h                 @ggerganov
 /ggml/src/ggml-cpu/                     @ggerganov
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
 /ggml/src/ggml-cuda/                    @ggml-org/ggml-cuda
-/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
-/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
 /ggml/src/ggml-hip/                     @IMbackK
+/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
 /ggml/src/ggml-impl.h                   @ggerganov
 /ggml/src/ggml-metal/                   @ggml-org/ggml-metal
 /ggml/src/ggml-opencl/                  @ggml-org/ggml-opencl
-/ggml/src/ggml-openvino/                @cavusmustafa @wine99
+/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @ggml-org/ggml-rpc
 /ggml/src/ggml-sycl/                    @ggml-org/ggml-sycl
 /ggml/src/ggml-threading.*              @ggerganov
-/ggml/src/ggml-virtgpu/                 @kpouget
 /ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
+/ggml/src/ggml-virtgpu/                 @kpouget
 /ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
 /ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml-zendnn/                  @avinashcpandey @Jiten1parmar @z-vishal
+/ggml/src/ggml-openvino/                @cavusmustafa @wine99
 /ggml/src/ggml.c                        @ggerganov
 /ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
--- a/README.md
+++ b/README.md
@@ -529,7 +529,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
 - [How to build](docs/build.md)
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
- [Multi-GPU usage](docs/multi-gpu.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -73,8 +73,6 @@ add_library(${TARGET}
    debug.h
    download.cpp
    download.h
-    fit.cpp
-    fit.h
    hf-cache.cpp
    hf-cache.h
    http.h
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -25,8 +25,7 @@ struct common_arg {
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
    std::string help;
-    bool is_sampling = false; // is current arg a sampling param?
-    bool is_spec = false; // is current arg a speculative decoding param?
+    bool is_sparam = false; // is current arg a sampling param?
    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
    void (*handler_void)   (common_params & params) = nullptr;
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
@@ -75,8 +74,7 @@ struct common_arg {
    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
-    common_arg & set_sampling();
-    common_arg & set_spec();
+    common_arg & set_sparam();
    common_arg & set_preset_only();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
@@ -129,8 +127,5 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

-// Populate model paths (main model, mmproj, etc) from -hf if necessary
-void common_params_handle_models(common_params & params, llama_example curr_ex);
-
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
        if (!end.empty()) {
            if (!start.empty()) {
                // Standard tag-based: optional(<think>reasoning</think>)
-                return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
+                return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
            }
            // Delimiter-style (empty start)
-            return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
+            return p.optional(p.reasoning(p.until(end)) + end + p.space());
        }
    }

@@ -186,6 +186,7 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
 common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    // Build effective field names with dot notation if function_field is set
    std::string name_field = format.name_field;
@@ -224,7 +225,8 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        tool_start = format.per_call_start;
    }

-    return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
+           p.end();
 }

 common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
@@ -268,6 +270,7 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    common_peg_parser tool_choice = p.choice();

@@ -333,12 +336,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+           p.end();
 }

 common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));

@@ -369,7 +374,9 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.tool_arg_string_value(until_suffix) :
+                                p.tool_arg_string_value(p.schema(until_suffix,
+                                                                 "tool-" + name + "-arg-" + param_name + "-schema",
+                                                                 param_schema, true)) :
                                p.tool_arg_json_value(p.schema(
                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
                                    p.space()) +
@@ -464,7 +471,8 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+           p.end();
 }

 }  // namespace autoparser
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
            return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
        });
        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
+            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
        });
        // try the more aggressive parse first, if it fails, fall back to the delimiter one
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
        if (result.result.success()) {
            if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
-                start = result.tags["pre"];
-                end   = result.tags["post"];
+                start = trim_leading_whitespace(result.tags["pre"]);
+                end   = trim_trailing_whitespace(result.tags["post"]);
            } else if (!result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
-                end = result.tags["post"];
+                end = trim_trailing_whitespace(result.tags["post"]);
            }
        }
    }
@@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
    if (left_trimmed.empty() && !diff.right.empty()) {
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
-                start = diff.right;
+                start = trim_leading_whitespace(diff.right);
                mode  = reasoning_mode::TAG_BASED;
            }
        }
@@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
                if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
                    start = seg[seg.size() - 2].value;
                }
-                end = diff.left;
+                end = trim_trailing_whitespace(diff.left);
                mode = reasoning_mode::TAG_BASED;
            }
        }
@@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
        if (result.result.success()) {
            start = result.tags["pre"];
-            end = result.tags["post"];
+            end = trim_trailing_whitespace(result.tags["post"]);
        } else {
            auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
                return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
            });
            result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
            if (result.result.success()) {
-                end = result.tags["post"];
+                end = trim_trailing_whitespace(result.tags["post"]);
            } else {
                LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
                mode = reasoning_mode::NONE;
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -816,32 +816,6 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
    return literal(s.substr(0, s.rfind(delimiter)));
 }

-common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
-    auto parser = eps();
-    size_t end_of_prefix_space = tag.size();
-    size_t start_of_suffix_space = tag.size();
-    for (size_t i = 0; i < tag.size(); i++) {
-        if (!std::isspace(tag[i])) {
-            end_of_prefix_space = i;
-            break;
-        }
-    }
-    for (size_t i = tag.size(); i > 0; i--) {
-        if (!std::isspace(tag[i - 1])) {
-            start_of_suffix_space = i;
-            break;
-        }
-    }
-    for (size_t i = 0; i < end_of_prefix_space; i++) {
-        parser += optional(literal(std::string(1, tag[i])));
-    }
-    parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
-    for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
-        parser += optional(literal(std::string(1, tag[i])));
-    }
-    return parser;
-}
-
 common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                       const std::string &              section_start,
                                                       const std::string &              section_end,
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -96,9 +96,6 @@ class common_chat_peg_builder : public common_peg_parser_builder {
    // Return a parser that parses the prefix of a string, up to a given delimiter.
    common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});

-    // Return a parser that parses all elements of tag, but leading and trailing spaces are optional
-    common_peg_parser optspace(const std::string & tag);
-
    // Legacy-compatible helper for building standard JSON tool calls
    // Used by tests and manual parsers
    // name_key/args_key: JSON key names for function name and arguments
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -80,7 +80,7 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
    if (!content.empty()) {
        jmsg["content"] = content;
    } else if (!content_parts.empty()) {
-        if (concat_typed_text || contains_media()) {
+        if (concat_typed_text) {
            std::string text;
            bool last_was_media_marker = false;
            // join parts with newline, do not add newline before or after media markers
@@ -397,25 +397,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
    return render_message_to_json(msgs, c);
 }

-json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
-    if (tools.empty()) {
-        return json();
-    }
-
-    auto result = json::array();
-    for (const auto & tool : tools) {
-        result.push_back({
-            { "type",     "function" },
-            { "function", {
-                { "name", tool.name },
-                { "description", tool.description },
-                { "parameters", json::parse(tool.parameters) },
-            }},
-        });
-    }
-    return result;
-}
-
 std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
    std::vector<common_chat_tool> result;

@@ -451,6 +432,56 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
    return result;
 }

+json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {  // serialize tools as an array of {"type":"function","function":{...}} objects
+    if (tools.empty()) {
+        return json();  // default-constructed json, not an empty array, when there are no tools
+    }
+
+    auto result = json::array();
+    for (const auto & tool : tools) {
+        result.push_back({
+            { "type",     "function" },
+            { "function",
+             {
+                  { "name", tool.name },
+                  { "description", tool.description },
+                  { "parameters", json::parse(tool.parameters) },  // parameters is held as JSON text; NOTE(review): presumably throws on malformed text - confirm
+              }                      },
+        });
+    }
+    return result;
+}
+
+json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {  // convert a message diff into a delta object; empty fields are omitted
+    json delta = json::object();  // stays {} when the diff carries nothing
+    if (!diff.reasoning_content_delta.empty()) {
+        delta["reasoning_content"] = diff.reasoning_content_delta;
+    }
+    if (!diff.content_delta.empty()) {
+        delta["content"] = diff.content_delta;
+    }
+    if (diff.tool_call_index != std::string::npos) {  // npos is treated as "no tool-call delta in this diff"
+        json tool_call;
+        tool_call["index"] = diff.tool_call_index;
+        if (!diff.tool_call_delta.id.empty()) {  // "id" and "type" are emitted together, only when an id is present
+            tool_call["id"]   = diff.tool_call_delta.id;
+            tool_call["type"] = "function";
+        }
+        if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {  // "function" is emitted only if it would have at least one field
+            json function = json::object();
+            if (!diff.tool_call_delta.name.empty()) {
+                function["name"] = diff.tool_call_delta.name;
+            }
+            if (!diff.tool_call_delta.arguments.empty()) {
+                function["arguments"] = diff.tool_call_delta.arguments;
+            }
+            tool_call["function"] = function;
+        }
+        delta["tool_calls"] = json::array({ tool_call });  // always a single-element array
+    }
+    return delta;
+}
+
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
    if (use_jinja) {
        try {
@@ -544,26 +575,6 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
    return tmpls->has_explicit_template;
 }

-// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
-// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
-static bool is_lfm2_template(const std::string & src) {
-    return src.find("<|tool_list_start|>") != std::string::npos &&
-           src.find("<|tool_list_end|>")   != std::string::npos;
-}
-
-common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
-    common_chat_prompt_preset asr_preset;
-    asr_preset.system = "";
-    asr_preset.user   = "Transcribe audio to text";
-
-    if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
-        asr_preset.system = "Perform ASR.";
-        asr_preset.user   = "";
-    }
-
-    return asr_preset;
-}
-
 std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
    if (!variant.empty()) {
        if (variant == "tool_use") {
@@ -2073,7 +2084,10 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

-    if (is_lfm2_template(src)) {
+    // LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
+    // and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
+    if (src.find("<|tool_list_start|>") != std::string::npos &&
+        src.find("<|tool_list_end|>") != std::string::npos) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params);
    }
@@ -2116,38 +2130,22 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
    return std::nullopt;
 }

-static std::string common_chat_templates_generation_prompt(const common_chat_template & tmpl, const autoparser::generation_params & inputs) {
-    autoparser::generation_params params = inputs;
-    params.add_generation_prompt = false;
-    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
-    params.add_generation_prompt = true;
-    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
-
-    size_t prefix_len = 0;
-    size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
-    while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
-        prefix_len++;
-    }
-    return gen_prompt.substr(prefix_len);
-}
-
 static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates *        tmpls,
                                                            const struct common_chat_templates_inputs & inputs) {
    autoparser::generation_params params;
    params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
    const auto & tmpl =
        params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
-    const auto & src             = tmpl.source();
-    const auto & caps            = tmpl.original_caps();
-    params.messages              = render_message_to_json(inputs.messages, tmpl.original_caps());
-    params.tool_choice           = inputs.tool_choice;
-    params.reasoning_format      = inputs.reasoning_format;
-    params.enable_thinking       = inputs.enable_thinking;
-    params.grammar               = inputs.grammar;
-    params.now                   = inputs.now;
-    params.add_generation_prompt = inputs.add_generation_prompt;
-    params.add_bos               = tmpls->add_bos;
-    params.add_eos               = tmpls->add_eos;
+    const auto & src        = tmpl.source();
+    const auto & caps       = tmpl.original_caps();
+    params.messages         = render_message_to_json(inputs.messages, tmpl.original_caps());
+    params.tool_choice      = inputs.tool_choice;
+    params.reasoning_format = inputs.reasoning_format;
+    params.enable_thinking  = inputs.enable_thinking;
+    params.grammar          = inputs.grammar;
+    params.now              = inputs.now;
+    params.add_bos          = tmpls->add_bos;
+    params.add_eos          = tmpls->add_eos;

    if (src.find("<|channel|>") == std::string::npos) {
        // map developer to system for all models except for GPT-OSS
@@ -2169,7 +2167,14 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        workaround::func_args_not_string(params.messages);
    }

-    params.generation_prompt = common_chat_templates_generation_prompt(tmpl, params);
+    params.add_generation_prompt = false;
+    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
+    params.add_generation_prompt = true;
+    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
+    auto        diff             = calculate_diff_split(no_gen_prompt, gen_prompt);
+    params.generation_prompt     = diff.right + diff.suffix;
+
+    params.add_generation_prompt = inputs.add_generation_prompt;

    params.extra_context = common_chat_extra_context();
    for (auto el : inputs.chat_template_kwargs) {
@@ -2221,8 +2226,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
-            auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
-            auto_params.thinking_end_tag   = trim_whitespace(autoparser.reasoning.end);
+            auto_params.thinking_start_tag = autoparser.reasoning.start;
+            auto_params.thinking_end_tag   = autoparser.reasoning.end;
        }
        auto_params.generation_prompt = params.generation_prompt;
        common_peg_arena arena;
@@ -2391,3 +2396,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
    GGML_ASSERT(chat_templates->template_default != nullptr);
    return chat_templates->template_default->caps.to_map();
 }
+
--- a/common/chat.h
+++ b/common/chat.h
@@ -94,15 +94,6 @@ struct common_chat_msg {
               tool_name.empty() && tool_call_id.empty();
    }

-    bool contains_media() const {
-        for (const auto & part : content_parts) {
-            if (part.type == "media_marker") {
-                return true;
-            }
-        }
-        return false;
-    }
-
    void set_tool_call_ids(std::vector<std::string> &           ids_cache,
                           const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
@@ -265,13 +256,14 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);

-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
-
 // DEPRECATED: only used in tests
 nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);

+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
 nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);

+nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+
 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);

@@ -283,11 +275,3 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        const common_chat_template &          tmpl,
        const std::string &                   src,
        autoparser::generation_params & params);
-
-// specialized per-task preset
-struct common_chat_prompt_preset {
-    std::string system;
-    std::string user;
-};
-
-common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3,7 +3,6 @@

 #include "build-info.h"
 #include "common.h"
-#include "fit.h"
 #include "log.h"
 #include "llama.h"
 #include "sampling.h"
@@ -70,7 +69,7 @@ common_time_meas::~common_time_meas() {
 // CPU utils
 //

-int32_t common_cpu_get_num_physical_cores() {
+int32_t cpu_get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
@@ -185,11 +184,11 @@ static int cpu_count_math_cpus(int n_cpu) {
 /**
 * Returns number of CPUs on system that are useful for math.
 */
-int32_t common_cpu_get_num_math() {
+int32_t cpu_get_num_math() {
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (n_cpu < 1) {
-        return common_cpu_get_num_physical_cores();
+        return cpu_get_num_physical_cores();
    }
    if (is_hybrid_cpu()) {
        cpu_set_t affinity;
@@ -202,7 +201,7 @@ int32_t common_cpu_get_num_math() {
        }
    }
 #endif
-    return common_cpu_get_num_physical_cores();
+    return cpu_get_num_physical_cores();
 }

 // Helper for setting process priority
@@ -263,7 +262,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 //


-void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
    int32_t n_set = 0;

    if (cpuparams.n_threads < 0) {
@@ -271,7 +270,7 @@ void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_para
        if (role_model != nullptr) {
            cpuparams = *role_model;
        } else {
-            cpuparams.n_threads = common_cpu_get_num_math();
+            cpuparams.n_threads = cpu_get_num_math();
        }
    }

@@ -1148,7 +1147,7 @@ common_init_result::common_init_result(common_params & params) :

    if (params.fit_params) {
        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
-        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
            params.fit_params_target.data(),
@@ -1422,7 +1421,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {

    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
+        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1521,7 +1520,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    return cparams;
 }

-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
    struct ggml_threadpool_params tpp;

    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
@@ -1960,102 +1959,3 @@ bool common_prompt_batch_decode(

    return true;
 }
-
-size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size();
-}
-
-bool common_prompt_checkpoint::empty() const {
-    return data_tgt.empty();
-}
-
-void common_prompt_checkpoint::clear() {
-    n_tokens = 0;
-
-    pos_min = 0;
-    pos_max = 0;
-
-    data_tgt.clear();
-    data_dft.clear();
-}
-
-void common_prompt_checkpoint::update_pos(
-        int64_t n_tokens,
-        llama_pos pos_min,
-        llama_pos pos_max) {
-    this->n_tokens = n_tokens;
-    this->pos_min  = pos_min;
-    this->pos_max  = pos_max;
-}
-
-void common_prompt_checkpoint::update_tgt(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
-
-    data_tgt.resize(ckpt_size);
-
-    const size_t n = llama_state_seq_get_data_ext(ctx, data_tgt.data(), ckpt_size, seq_id, flags);
-    if (n != ckpt_size) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
-    }
-}
-
-void common_prompt_checkpoint::update_dft(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
-
-    data_dft.resize(ckpt_size);
-
-    const size_t n = llama_state_seq_get_data_ext(ctx, data_dft.data(), ckpt_size, seq_id, flags);
-    if (n != ckpt_size) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
-    }
-}
-
-void common_prompt_checkpoint::load_tgt(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) const {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    if (data_tgt.empty()) {
-        return;
-    }
-
-    const size_t n = llama_state_seq_set_data_ext(ctx, data_tgt.data(), data_tgt.size(), seq_id, flags);
-    if (n != data_tgt.size()) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_tgt.size(), n);
-    }
-}
-
-void common_prompt_checkpoint::load_dft(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) const {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    if (data_dft.empty()) {
-        return;
-    }
-
-    const size_t n = llama_state_seq_set_data_ext(ctx, data_dft.data(), data_dft.size(), seq_id, flags);
-    if (n != data_dft.size()) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
-    }
-}
--- a/common/common.h
+++ b/common/common.h
@@ -54,7 +54,7 @@ struct common_control_vector_load_info;
 // CPU utils
 //

-struct common_cpu_params {
+struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool     mask_valid                  = false;   // Default: any CPU
@@ -63,8 +63,8 @@ struct common_cpu_params {
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };

-int32_t common_cpu_get_num_physical_cores();
-int32_t common_cpu_get_num_math();
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();

 //
 // Common params
@@ -274,7 +274,6 @@ struct common_params_sampling {
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
-    std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted

    bool backend_sampling = false;

@@ -295,73 +294,62 @@ struct common_params_model {
    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

-// draft-model-based speculative decoding parameters
-struct common_params_speculative_draft {
-    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 0;  // minimum number of draft tokens to use for speculative decoding
+struct common_ngram_mod;

-    float p_split = 0.1f;  // speculative decoding split probability
-    float p_min   = 0.75f; // minimum speculative decoding probability (greedy)
+struct common_params_speculative {
+    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

-    common_params_model mparams;
+    // general-purpose speculative decoding parameters

-    llama_context * ctx_tgt = nullptr;
-    llama_context * ctx_dft = nullptr;
+    int32_t n_max   = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min   = 0;  // minimum number of draft tokens to use for speculative decoding
+    float   p_split = 0.1f; // speculative decoding split probability
+    float   p_min   = 0.75f; // minimum speculative decoding probability (greedy)

+    // ngram-based speculative decoding
+
+    uint16_t ngram_size_n   = 12; // ngram size for lookup
+    uint16_t ngram_size_m   = 48; // mgram size for speculative tokens
+    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
+
+    std::shared_ptr<common_ngram_mod> ngram_mod;
+
+    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding           // NOLINT
+    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding          // NOLINT
+
+    // draft-model speculative decoding
+
+    struct common_params_model mparams_dft;
+
+    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
+
+    llama_context_params cparams_dft; // these are the parameters for the draft llama_context
+
+    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

-    common_cpu_params cpuparams;
-    common_cpu_params cpuparams_batch;
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
-};
-
-struct common_params_speculative_ngram_mod {
-    int32_t n_match = 24;
-
-    int32_t n_max = 64;
-    int32_t n_min = 48;
-};
-
-struct common_params_speculative_ngram_map {
-    uint16_t size_n   = 12; // ngram size for lookup
-    uint16_t size_m   = 48; // mgram size for speculative tokens
-    uint16_t min_hits = 1;  // minimum hits at ngram/mgram lookup for mgram to be proposed
-};
-
-struct common_params_speculative_ngram_cache {
-    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
-};
-
-struct common_params_speculative {
-    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };
-
-    common_params_speculative_draft draft;
-
-    common_params_speculative_ngram_mod ngram_mod;
-    common_params_speculative_ngram_map ngram_simple;
-    common_params_speculative_ngram_map ngram_map_k;
-    common_params_speculative_ngram_map ngram_map_k4v;
-
-    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
+        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
 };

 struct common_params_vocoder {
    struct common_params_model model;

-    std::string speaker_file; // speaker file path
+    std::string speaker_file = ""; // speaker file path                                      // NOLINT

-    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };

 struct common_params_diffusion {
@@ -432,20 +420,19 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
-    bool    fit_params         = true;  // whether to fit unset model/context parameters to free device memory
-    bool    fit_params_print   = false; // print the estimated required memory to run the model
-    int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

-    common_cpu_params cpuparams;
-    common_cpu_params cpuparams_batch;
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;
@@ -593,6 +580,8 @@ struct common_params {
    bool force_pure_content_parser = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
+    int reasoning_budget = -1;
+    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time

@@ -689,7 +678,7 @@ std::string common_params_get_system_info(const common_params & params);

 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
@@ -757,11 +746,6 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
           str.compare(0, prefix.size(), prefix) == 0;
 }

-// remove when moving to c++20
-inline bool string_starts_with(std::string_view str, char prefix) {
-    return !str.empty() && str.front() == prefix;
-}
-
 // remove when moving to c++20
 inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    return str.size() >= suffix.size() &&
@@ -857,7 +841,7 @@ common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
@@ -1017,47 +1001,3 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

 // "adamw" or "sgd" (case insensitive)
 enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
-
-//
-// prompt utils
-//
-
-struct common_prompt_checkpoint {
-    int64_t n_tokens;
-
-    llama_pos pos_min;
-    llama_pos pos_max;
-
-    std::vector<uint8_t> data_tgt;
-    std::vector<uint8_t> data_dft;
-
-    size_t size() const;
-
-    bool empty() const;
-    void clear();
-
-    void update_pos(
-            int64_t n_tokens,
-            llama_pos pos_min,
-            llama_pos pos_max);
-
-    void update_tgt(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags);
-
-    void update_dft(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags);
-
-    void load_tgt(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags) const;
-
-    void load_dft(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags) const;
-};
--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -1,38 +1,9 @@
 #include "debug.h"

-#include "common.h"
 #include "log.h"

 #include <cmath>
-#include <regex>
 #include <string>
-#include <vector>
-
-struct common_debug_cb_user_data::impl {
-    std::vector<uint8_t>    data;
-    std::vector<std::regex> tensor_filters;
-    bool                    abort_on_nan{false};
-};
-
-common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
-common_debug_cb_user_data::~common_debug_cb_user_data() = default;
-
-common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
-    : pimpl(std::make_unique<impl>())
-{
-    for (const auto & pattern : filter_patterns) {
-        try {
-            std::string anchored_pattern = "^" + pattern;
-            pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
-        } catch (const std::regex_error & e) {
-            throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
-        }
-    }
-    pimpl->abort_on_nan = abort_on_nan;
-
-    params.cb_eval           = common_debug_cb_eval;
-    params.cb_eval_user_data = this;
-}

 static std::string common_ggml_ne_string(const ggml_tensor * t) {
    std::string str;
@@ -76,7 +47,8 @@ static float common_ggml_get_float_value(const uint8_t * data,

 #define INDENT "    "

-static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
+template <bool abort>
+void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -122,7 +94,7 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
        LOG(INDENT "sum = %f\n", sum);
    }

-    if (abort_on_nan) {
+    if constexpr (abort) {
        if (std::isnan(sum)) {
            LOG("encountered NaN - aborting\n");
            exit(0);
@@ -140,9 +112,8 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
 * @param user_data user data to pass at each call back
 * @return true to receive data or continue the graph, false otherwise
 */
-bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
-    auto * cb_data = (common_debug_cb_user_data *) user_data;
-    auto * pimpl = cb_data->pimpl.get();
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (base_callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
@@ -151,10 +122,10 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
        return true;  // Always retrieve data
    }

-    bool matches_filter = pimpl->tensor_filters.empty();
+    bool matches_filter = cb_data->tensor_filters.empty();

    if (!matches_filter) {
-        for (const auto & filter : pimpl->tensor_filters) {
+        for (const auto & filter : cb_data->tensor_filters) {
            if (std::regex_search(t->name, filter)) {
                matches_filter = true;
                break;
@@ -177,14 +148,20 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
-        pimpl->data.resize(n_bytes);
-        ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    if (!ggml_is_quantized(t->type) && matches_filter) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
-        common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
    }

    return true;
 }
+
+// Explicit template instantiations
+template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
+template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
+template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
+template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
--- a/common/debug.h
+++ b/common/debug.h
@@ -1,31 +1,43 @@
 #pragma once
-
-#include <memory>
+#include "common.h"
 #include <string>
 #include <vector>
+#include <regex>

 // common debug functions and structs

-struct common_params;
+// Print a tensor's detailed data
+// data - the tensor's data in byte format
+// type - the tensor's data type (ggml_type)
+// ne   - the tensor dimensions array
+// nb   - the tensor strides array
+// n    - the number of rows/columns to fully print
+template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);

 // Intended to use as callback for ggml_backend_sched_eval_callback
 // prints tensors that are processed in the computation graph
-// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
-// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
-// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
+// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
+// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
+// The template parameter determines whether the process logs a message and exits whenever a NaN is encountered
 // in a tensor (useful for stopping debug sessions on first erroneous tensor)
 // The callback data will be passed as the third parameter (user_data)
-bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+struct base_callback_data {
+    std::vector<uint8_t>    data;
+    std::vector<std::regex> tensor_filters;

-struct common_debug_cb_user_data {
-    struct impl;
-    std::unique_ptr<impl> pimpl;
+    base_callback_data() = default;

-    common_debug_cb_user_data();
-    ~common_debug_cb_user_data();
-
-    common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
-    common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
-
-    common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
+    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
+        for (const auto & pattern : filter_patterns) {
+            try {
+                std::string anchored_pattern = "^" + pattern;
+                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
+            } catch (const std::regex_error & e) {
+                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
+            }
+        }
+        params.cb_eval           = common_debug_cb_eval<false>;
+        params.cb_eval_user_data = this;
+    }
 };
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -627,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
    if (!tag.empty()) {
        tags.push_back(tag);
    } else {
-        tags = {"Q4_K_M", "Q8_0"};
+        tags = {"Q4_K_M", "Q4_0"};
    }

    for (const auto & t : tags) {
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -1,959 +0,0 @@
-#include "fit.h"
-
-#include "log.h"
-
-#include "../src/llama-ext.h"
-
-#include <array>
-#include <cassert>
-#include <stdexcept>
-#include <cinttypes>
-#include <set>
-#include <string>
-#include <vector>
-
-// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
-// enum to identify part of a layer for distributing its tensors:
-enum common_layer_fraction_t {
-    LAYER_FRACTION_NONE = 0, // nothing
-    LAYER_FRACTION_ATTN = 1, // attention
-    LAYER_FRACTION_UP   = 2, // attention + up
-    LAYER_FRACTION_GATE = 3, // attention + up + gate
-    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
-};
-
-class common_params_fit_exception : public std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
-static std::vector<llama_device_memory_data> common_get_device_memory_data(
-        const char * path_model,
-        const llama_model_params * mparams,
-        const llama_context_params * cparams,
-        std::vector<ggml_backend_dev_t> & devs,
-        uint32_t & hp_ngl,
-        uint32_t & hp_n_ctx_train,
-        uint32_t & hp_n_expert,
-        ggml_log_level log_level) {
-    struct user_data_t {
-        struct {
-            ggml_log_callback callback;
-            void * user_data;
-        } original_logger;
-        ggml_log_level min_level; // prints below this log level go to debug log
-    };
-    user_data_t ud;
-    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
-    ud.min_level = log_level;
-
-    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
-        const user_data_t * ud = (const user_data_t *) user_data;
-        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
-        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
-    }, &ud);
-
-    llama_model_params mparams_copy = *mparams;
-    mparams_copy.no_alloc  = true;
-    mparams_copy.use_mmap  = false;
-    mparams_copy.use_mlock = false;
-
-    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
-    if (model == nullptr) {
-        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-        throw std::runtime_error("failed to load model");
-    }
-
-    llama_context * ctx = llama_init_from_model(model, *cparams);
-    if (ctx == nullptr) {
-        llama_model_free(model);
-        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-        throw std::runtime_error("failed to create llama_context from model");
-    }
-
-    const size_t nd = llama_model_n_devices(model);
-    std::vector<llama_device_memory_data> ret(nd + 1);
-
-    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
-
-    for (const auto & [buft, mb] : memory_breakdown) {
-        if (ggml_backend_buft_is_host(buft)) {
-            ret.back().mb.model   += mb.model;
-            ret.back().mb.context += mb.context;
-            ret.back().mb.compute += mb.compute;
-            continue;
-        }
-
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
-        if (!dev) {
-            continue;
-        }
-        for (size_t i = 0; i < nd; i++) {
-            if (dev == llama_model_get_device(model, i)) {
-                ret[i].mb.model   += mb.model;
-                ret[i].mb.context += mb.context;
-                ret[i].mb.compute += mb.compute;
-                break;
-            }
-        }
-    }
-
-    {
-        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        if (cpu_dev == nullptr) {
-            throw std::runtime_error("no CPU backend found");
-        }
-        size_t free;
-        size_t total;
-        ggml_backend_dev_memory(cpu_dev, &free, &total);
-        ret.back().free  = free;
-        ret.back().total = total;
-    }
-    for (size_t i = 0; i < nd; i++) {
-        ggml_backend_dev_t dev = llama_model_get_device(model, i);
-
-        size_t free;
-        size_t total;
-        ggml_backend_dev_memory(dev, &free, &total);
-
-        // Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
-        // the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
-        // not assign anything to a device with an unknown memory budget.
-        if (free == 0 && total == 0) {
-            const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
-            if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
-                LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
-                        __func__, ggml_backend_dev_name(dev));
-            } else {
-                free  = ret.back().free;
-                total = ret.back().total;
-            }
-        }
-        ret[i].free  = free;
-        ret[i].total = total;
-    }
-
-    devs.clear();
-    for (int i = 0; i < llama_model_n_devices(model); i++) {
-        devs.push_back(llama_model_get_device(model, i));
-    }
-
-    hp_ngl         = llama_model_n_layer(model);
-    hp_n_ctx_train = llama_model_n_ctx_train(model);
-    hp_n_expert    = llama_model_n_expert(model);
-
-    common_memory_breakdown_print(ctx);
-
-    llama_free(ctx);
-    llama_model_free(model);
-    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
-
-    return ret;
-}
-
-static void common_params_fit_impl(
-        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
-        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
-    if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
-        throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
-    }
-    constexpr int64_t MiB = 1024*1024;
-    typedef std::vector<llama_device_memory_data> dmds_t;
-    const llama_model_params default_mparams = llama_model_default_params();
-
-    std::vector<ggml_backend_dev_t> devs;
-    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
-    uint32_t hp_nct = 0; // hparams.n_ctx_train
-    uint32_t hp_nex = 0; // hparams.n_expert
-
-    // step 1: get data for default parameters and check whether any changes are necessary in the first place
-
-    LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
-    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-    const size_t nd = devs.size(); // number of devices
-
-    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
-    margins.reserve(nd);
-    if (nd == 0) {
-        margins.push_back(margins_s[0]);
-    } else {
-        for (size_t id = 0; id < nd; id++) {
-            margins.push_back(margins_s[id]);
-        }
-    }
-
-    std::vector<std::string> dev_names;
-    {
-        dev_names.reserve(nd);
-        size_t max_length = 0;
-        for (const auto & dev : devs) {
-            std::string name = ggml_backend_dev_name(dev);
-            name += " (";
-            name += ggml_backend_dev_description(dev);
-            name += ")";
-            dev_names.push_back(name);
-            max_length = std::max(max_length, name.length());
-        }
-        for (std::string & dn : dev_names) {
-            dn.insert(dn.end(), max_length - dn.length(), ' ');
-        }
-    }
-
-    int64_t sum_free            = 0;
-    int64_t sum_projected_free  = 0;
-    int64_t sum_projected_used  = 0;
-    int64_t sum_projected_model = 0;
-    std::vector<int64_t> projected_free_per_device;
-    projected_free_per_device.reserve(nd);
-
-    if (nd == 0) {
-        sum_projected_used = dmds_full.back().mb.total();
-        sum_free           = dmds_full.back().total;
-        sum_projected_free = sum_free - sum_projected_used;
-        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
-            __func__, sum_projected_used/MiB, sum_free/MiB);
-        if (sum_projected_free >= margins[0]) {
-            LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
-                __func__, sum_projected_free/MiB, margins[0]/MiB);
-            return;
-        }
-    } else {
-        if (nd > 1) {
-            LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-        }
-        for (size_t id = 0; id < nd; id++) {
-            const llama_device_memory_data & dmd = dmds_full[id];
-
-            const int64_t projected_used = dmd.mb.total();
-            const int64_t projected_free = dmd.free - projected_used;
-            projected_free_per_device.push_back(projected_free);
-
-            sum_free            += dmd.free;
-            sum_projected_used  += projected_used;
-            sum_projected_free  += projected_free;
-            sum_projected_model += dmd.mb.model;
-
-            if (nd > 1) {
-                LOG_INF("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-            }
-        }
-        assert(sum_free >= 0 && sum_projected_used >= 0);
-        LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-            __func__, sum_projected_used/MiB, sum_free/MiB);
-        if (nd == 1) {
-            if (projected_free_per_device[0] >= margins[0]) {
-                LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
-                return;
-            }
-        } else {
-            bool changes_needed = false;
-            for (size_t id = 0; id < nd; id++) {
-                if (projected_free_per_device[id] < margins[id]) {
-                    changes_needed = true;
-                    break;
-                }
-            }
-            if (!changes_needed) {
-                LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-                return;
-            }
-        }
-    }
-
-    // step 2: try reducing memory use by reducing the context size
-
-    {
-        int64_t global_surplus = sum_projected_free;
-        if (nd == 0) {
-            global_surplus -= margins[0];
-        } else {
-            for (size_t id = 0; id < nd; id++) {
-                global_surplus -= margins[id];
-            }
-        }
-        if (global_surplus < 0) {
-            if (nd <= 1) {
-                LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
-                    __func__, margins[0]/MiB, -global_surplus/MiB);
-            } else {
-                LOG_INF(
-                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
-                    __func__, -global_surplus/MiB);
-            }
-            if (cparams->n_ctx == 0) {
-                if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free;
-                    if (nd == 0) {
-                        sum_used_target -= margins[0];
-                    } else {
-                        for (size_t id = 0; id < nd; id++) {
-                            sum_used_target -= margins[id];
-                        }
-                    }
-                    if (nd > 1) {
-                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
-                        //   - for dense models only whole layers can be assigned to devices
-                        //   - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
-                        //   - on average we expect a waste of 0.5 layers/tensors per device
-                        //   - use slightly more than the expected average for nd devices to be safe
-                        const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
-                        sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
-                    }
-
-                    int64_t sum_projected_used_min_ctx = 0;
-                    cparams->n_ctx = n_ctx_min;
-                    const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                    if (nd == 0) {
-                        sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
-                    } else {
-                        for (size_t id = 0; id < nd; id++) {
-                            sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
-                        }
-                    }
-                    if (sum_used_target > sum_projected_used_min_ctx) {
-                        // linear interpolation between minimum and maximum context size:
-                        cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
-                            / (sum_projected_used - sum_projected_used_min_ctx);
-                        cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
-                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
-                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                        if (nd <= 1) {
-                            LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
-                            return;
-                        }
-                        LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
-                    } else {
-                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                    }
-                } else {
-                    if (n_ctx_min == UINT32_MAX) {
-                        LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
-                    } else {
-                        LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
-                            __func__, hp_nct, n_ctx_min);
-                    }
-                }
-            } else {
-                LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
-            }
-        }
-    }
-    if (nd == 0) {
-        throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
-    }
-
-    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
-    }
-    if (nd > 1) {
-        if (!tensor_split) {
-            throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
-        }
-        if (mparams->tensor_split) {
-            for (size_t id = 0; id < nd; id++) {
-                if (mparams->tensor_split[id] != 0.0f) {
-                    throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
-                }
-            }
-        }
-        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
-        }
-    }
-    if (!tensor_buft_overrides) {
-        throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
-    }
-    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
-    }
-
-    // step 3: iteratively fill the back to front with "dense" layers
-    //   - for a dense model simply fill full layers, giving each device a contiguous slice of the model
-    //   - for a MoE model, same as dense model but with all MoE tensors in system memory
-
-    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
-    auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
-        constexpr size_t n_strings = 1000;
-        if (il >= n_strings) {
-            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
-        }
-        switch (lf) {
-            case LAYER_FRACTION_ATTN: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_UP: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_GATE: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
-                }
-                return patterns[il].c_str();
-            }
-            case LAYER_FRACTION_MOE: {
-                static std::array<std::string, n_strings> patterns;
-                if (patterns[il].empty()) {
-                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
-                }
-                return patterns[il].c_str();
-            }
-            default:
-                GGML_ABORT("fatal error");
-        }
-    };
-
-    struct ngl_t {
-        uint32_t n_layer = 0; // number of total layers
-        uint32_t n_part  = 0; // number of partial layers, <= n_layer
-
-        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
-        common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
-
-        uint32_t n_full() const {
-            assert(n_layer >= n_part);
-            return n_layer - n_part;
-        }
-    };
-
-    const size_t ntbo = llama_max_tensor_buft_overrides();
-
-    // utility function to set n_gpu_layers and tensor_split
-    auto set_ngl_tensor_split_tbo = [&](
-            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams) {
-        mparams.n_gpu_layers = 0;
-        for (size_t id = 0; id < nd; id++) {
-            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
-            if (nd > 1) {
-                tensor_split[id] = ngl_per_device[id].n_layer;
-            }
-        }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
-        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
-
-        mparams.tensor_split = tensor_split;
-
-        size_t itbo = 0;
-        for (size_t id = 0; id < nd; id++) {
-            il0 += ngl_per_device[id].n_full();
-            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
-                if (itbo + 1 >= ntbo) {
-                    tensor_buft_overrides[itbo].pattern = nullptr;
-                    tensor_buft_overrides[itbo].buft    = nullptr;
-                    itbo++;
-                    mparams.tensor_buft_overrides = tensor_buft_overrides;
-                    throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
-                        + std::to_string(ntbo) + " is insufficient for model");
-                }
-                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
-                itbo++;
-            }
-            il0 += ngl_per_device[id].n_part;
-        }
-        tensor_buft_overrides[itbo].pattern = nullptr;
-        tensor_buft_overrides[itbo].buft    = nullptr;
-        itbo++;
-        mparams.tensor_buft_overrides = tensor_buft_overrides;
-    };
-
-    // utility function that returns the memory use per device for given numbers of layers per device
-    auto get_memory_for_layers = [&](
-            const char * func_name,
-            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
-        llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
-
-        const dmds_t dmd_nl = common_get_device_memory_data(
-            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-
-        LOG_INF("%s: memory for test allocation by device:\n", func_name);
-        for (size_t id = 0; id < nd; id++) {
-            const ngl_t & n = ngl_per_device[id];
-            LOG_INF(
-                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
-                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
-        }
-
-        std::vector<int64_t> ret;
-        ret.reserve(nd);
-        for (size_t id = 0; id < nd; id++) {
-            ret.push_back(dmd_nl[id].mb.total());
-        }
-        return ret;
-    };
-
-    int64_t global_surplus_cpu_moe = 0;
-    if (hp_nex > 0) {
-        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
-        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
-        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
-        tensor_buft_overrides[1] = {nullptr, nullptr};
-        mparams->tensor_buft_overrides = tensor_buft_overrides;
-
-        LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
-        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
-            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
-            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
-        }
-
-        if (global_surplus_cpu_moe > 0) {
-            LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
-                __func__, global_surplus_cpu_moe/MiB);
-        } else {
-            LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
-                __func__, -global_surplus_cpu_moe/MiB);
-        }
-
-        // reset
-        tensor_buft_overrides[0] = {nullptr, nullptr};
-        mparams->tensor_buft_overrides = tensor_buft_overrides;
-    }
-
-    std::vector<int64_t> targets; // maximum acceptable memory use per device
-    targets.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free - margins[id]);
-        LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
-    }
-
-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
-    overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
-    }
-
-    std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-
-    // optimize the number of layers per device using the method of false position:
-    //   - ngl_per_device has 0 layers for each device, lower bound
-    //   - try a "high" configuration where a device is given all unassigned layers
-    //   - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
-    //   - check memory use of our guess, replace either the low or high bound
-    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
-    //   - the last device has the output layer, which cannot be a partial layer
-    if (hp_nex == 0) {
-        LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
-    } else {
-        LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
-    }
-    for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl + 1;
-        for (size_t jd = id + 1; jd < nd; ++jd) {
-            assert(n_unassigned >= ngl_per_device[jd].n_layer);
-            n_unassigned -= ngl_per_device[jd].n_layer;
-        }
-
-        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
-        ngl_per_device_high[id].n_layer = n_unassigned;
-        if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
-        }
-        if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
-            if (mem_high[id] > targets[id]) {
-                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
-                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
-                while (delta > 1) {
-                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
-                    step_size = std::max(step_size, uint32_t(1));
-                    step_size = std::min(step_size, delta - 1);
-
-                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-                    ngl_per_device_test[id].n_layer += step_size;
-                    if (hp_nex) {
-                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
-                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
-                    }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-
-                    if (mem_test[id] <= targets[id]) {
-                        ngl_per_device = ngl_per_device_test;
-                        mem            = mem_test;
-                        LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
-                    } else {
-                        ngl_per_device_high = ngl_per_device_test;
-                        mem_high            = mem_test;
-                        LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
-                    }
-                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                }
-            } else {
-                assert(ngl_per_device_high[id].n_layer == n_unassigned);
-                ngl_per_device = ngl_per_device_high;
-                mem            = mem_high;
-                LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
-            }
-        }
-
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
-            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
-    }
-    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
-        return;
-    }
-
-    // step 4: for a MoE model where all dense tensors fit,
-    //     convert the dense-only layers in the back to full layers in the front until all devices are full
-    // essentially the same procedure as for the dense-only layers except front-to-back
-    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
-
-    size_t id_dense_start = nd;
-    for (int id = nd - 1; id >= 0; id--) {
-        if (ngl_per_device[id].n_layer > 0) {
-            id_dense_start = id;
-            continue;
-        }
-        break;
-    }
-    assert(id_dense_start < nd);
-
-    LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
-        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
-        for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
-            ngl_per_device_high[id].n_layer += n_layer_move;
-            ngl_per_device_high[jd].n_layer -= n_layer_move;
-            ngl_per_device_high[jd].n_part = 0;
-        }
-        size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
-
-        if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
-            while (delta > 1) {
-                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
-                step_size = std::max(step_size, uint32_t(1));
-                step_size = std::min(step_size, delta - 1);
-
-                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-                size_t id_dense_start_test = id_dense_start;
-                uint32_t n_converted_test = 0;
-                for (;id_dense_start_test < nd; id_dense_start_test++) {
-                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
-                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
-                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
-                    ngl_per_device_test[id].n_layer += n_convert_jd;
-                    n_converted_test += n_convert_jd;
-
-                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
-                        break;
-                    }
-                }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-
-                if (mem_test[id] <= targets[id]) {
-                    ngl_per_device = ngl_per_device_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                } else {
-                    ngl_per_device_high = ngl_per_device_test;
-                    mem_high            = mem_test;
-                    id_dense_start_high = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
-                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
-                }
-                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
-            }
-        } else {
-            ngl_per_device = ngl_per_device_high;
-            mem            = mem_high;
-            id_dense_start = id_dense_start_high;
-            LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
-                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-        }
-
-        // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
-            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
-            size_t id_dense_start_test = id_dense_start;
-            ngl_per_device_test[id_dense_start_test].n_layer--;
-            ngl_per_device_test[id_dense_start_test].n_part--;
-            ngl_per_device_test[id].n_layer++;
-            ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
-                id_dense_start_test++;
-            }
-            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
-            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
-            if (id < nd - 1) {
-                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
-            }
-            LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                ngl_per_device = ngl_per_device_test;
-                overflow_bufts = overflow_bufts_test;
-                mem            = mem_test;
-                id_dense_start = id_dense_start_test;
-                LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
-                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-
-                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                }
-            } else {
-                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
-                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
-                    mem            = mem_test;
-                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
-                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
-                }
-            }
-        }
-
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    // print info for devices that were not changed during the conversion from dense only to full layers:
-    for (size_t id = id_dense_start + 1; id < nd; id++) {
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
-}
-
-enum common_params_fit_status common_fit_params(
-        const char * path_model,
-        llama_model_params * mparams,
-        llama_context_params * cparams,
-        float * tensor_split,
-        llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins,
-        uint32_t n_ctx_min,
-        ggml_log_level log_level) {
-    const int64_t t0_us = llama_time_us();
-    common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
-    try {
-        common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
-        LOG_INF("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const common_params_fit_exception & e) {
-        LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-        status = COMMON_PARAMS_FIT_STATUS_FAILURE;
-    } catch (const std::runtime_error & e) {
-        LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
-        status = COMMON_PARAMS_FIT_STATUS_ERROR;
-    }
-    const int64_t t1_us = llama_time_us();
-    LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return status;
-}
-
-void common_memory_breakdown_print(const struct llama_context * ctx) {
-    //const auto & devices = ctx->get_model().devices;
-    const auto * model = llama_get_model(ctx);
-
-    std::vector<ggml_backend_dev_t> devices;
-    for (int i = 0; i < llama_model_n_devices(model); i++) {
-        devices.push_back(llama_model_get_device(model, i));
-    }
-
-    llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
-
-    std::vector<std::array<std::string, 9>> table_data;
-    table_data.reserve(devices.size());
-    const std::string template_header = "%s: | %s | %s   %s    %s   %s   %s   %s    %s |\n";
-    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
-    const std::string template_other  = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
-
-    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
-
-    constexpr size_t MiB = 1024 * 1024;
-    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
-
-    // track seen buffer types to avoid double counting:
-    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
-
-    // accumulative memory breakdown for each device and for host:
-    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
-    llama_memory_breakdown_data              mb_host;
-
-    for (const auto & buft_mb : memory_breakdown) {
-        ggml_backend_buffer_type_t          buft = buft_mb.first;
-        const llama_memory_breakdown_data & mb   = buft_mb.second;
-        if (ggml_backend_buft_is_host(buft)) {
-            mb_host.model   += mb.model;
-            mb_host.context += mb.context;
-            mb_host.compute += mb.compute;
-            seen_buffer_types.insert(buft);
-            continue;
-        }
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
-        if (dev) {
-            int i_dev = -1;
-            for (size_t i = 0; i < devices.size(); i++) {
-                if (devices[i] == dev) {
-                    i_dev = i;
-                    break;
-                }
-            }
-            if (i_dev != -1) {
-                mb_dev[i_dev].model   += mb.model;
-                mb_dev[i_dev].context += mb.context;
-                mb_dev[i_dev].compute += mb.compute;
-                seen_buffer_types.insert(buft);
-                continue;
-            }
-        }
-    }
-
-    // print memory breakdown for each device:
-    for (size_t i = 0; i < devices.size(); i++) {
-        ggml_backend_dev_t dev = devices[i];
-        llama_memory_breakdown_data mb = mb_dev[i];
-
-        const std::string name = ggml_backend_dev_name(dev);
-        std::string desc = ggml_backend_dev_description(dev);
-        for (const std::string & prefix : desc_prefixes_strip) {
-            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
-                desc = desc.substr(prefix.length());
-            }
-        }
-
-        size_t free, total;
-        ggml_backend_dev_memory(dev, &free, &total);
-
-        const size_t self = mb.model + mb.context + mb.compute;
-        const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);
-
-        table_data.push_back({
-            template_gpu,
-            "  - " + name + " (" + desc + ")",
-            std::to_string(total / MiB),
-            std::to_string(free / MiB),
-            std::to_string(self / MiB),
-            std::to_string(mb.model / MiB),
-            std::to_string(mb.context / MiB),
-            std::to_string(mb.compute / MiB),
-            std::to_string(unaccounted / static_cast<int64_t>(MiB))});
-    }
-
-    // print memory breakdown for host:
-    {
-        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
-        table_data.push_back({
-            template_other,
-            "  - Host",
-            "", // total
-            "", // free
-            std::to_string(self / MiB),
-            std::to_string(mb_host.model / MiB),
-            std::to_string(mb_host.context / MiB),
-            std::to_string(mb_host.compute / MiB),
-            ""}); // unaccounted
-    }
-
-    // print memory breakdown for all remaining buffer types:
-    for (const auto & buft_mb : memory_breakdown) {
-        ggml_backend_buffer_type_t          buft = buft_mb.first;
-        const llama_memory_breakdown_data & mb   = buft_mb.second;
-        if (seen_buffer_types.count(buft) == 1) {
-            continue;
-        }
-        const std::string name = ggml_backend_buft_name(buft);
-        const size_t self = mb.model + mb.context + mb.compute;
-        table_data.push_back({
-            template_other,
-            "  - " + name,
-            "", // total
-            "", // free
-            std::to_string(self / MiB),
-            std::to_string(mb.model / MiB),
-            std::to_string(mb.context / MiB),
-            std::to_string(mb.compute / MiB),
-            ""}); // unaccounted
-        seen_buffer_types.insert(buft);
-    }
-
-    for (size_t j = 1; j < table_data[0].size(); j++) {
-        size_t max_len = 0;
-        for (const auto & td : table_data) {
-            max_len = std::max(max_len, td[j].length());
-        }
-        for (auto & td : table_data) {
-            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
-        }
-    }
-    for (const auto & td : table_data) {
-        LOG_INF(td[0].c_str(),
-            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
-            td[6].c_str(), td[7].c_str(), td[8].c_str());
-    }
-}
-
-void common_fit_print(
-        const char * path_model,
-        llama_model_params * mparams,
-        llama_context_params * cparams) {
-    std::vector<ggml_backend_dev_t> devs;
-    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
-    uint32_t hp_nct = 0; // hparams.n_ctx_train
-    uint32_t hp_nex = 0; // hparams.n_expert
-
-    auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
-    GGML_ASSERT(dmd.size() == devs.size() + 1);
-
-    for (size_t id = 0; id < devs.size(); id++) {
-        printf("%s ",  ggml_backend_dev_name(devs[id]));
-        printf("%zu ", dmd[id].mb.model/1024/1024);
-        printf("%zu ", dmd[id].mb.context/1024/1024);
-        printf("%zu ", dmd[id].mb.compute/1024/1024);
-        printf("\n");
-    }
-
-    printf("Host ");
-    printf("%zu ", dmd.back().mb.model/1024/1024);
-    printf("%zu ", dmd.back().mb.context/1024/1024);
-    printf("%zu ", dmd.back().mb.compute/1024/1024);
-    printf("\n");
-}
--- a/common/fit.h
+++ b/common/fit.h
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-enum common_params_fit_status {
-    COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
-    COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
-    COMMON_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
-};
-
-// fits mparams and cparams to free device memory (assumes system memory is unlimited)
-//   - returns true if the parameters could be successfully modified to fit device memory
-//   - this function is NOT thread safe because it modifies the global llama logger state
-//   - only parameters that have the same value as in llama_default_model_params are modified
-//     with the exception of the context size which is modified if and only if equal to 0
-enum common_params_fit_status common_fit_params(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams,
-                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                     size_t * margins,               // margins of memory to leave per device in bytes
-                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
-
-// print estimated memory to stdout
-void common_fit_print(
-                               const char   * path_model,
-                struct llama_model_params   * mparams,
-                struct llama_context_params * cparams);
-
-void common_memory_breakdown_print(const struct llama_context * ctx);
--- a/common/hf-cache.cpp
+++ b/common/hf-cache.cpp
@@ -57,7 +57,7 @@ static fs::path get_cache_directory() {
 #ifndef _WIN32
        const struct passwd * pw = getpwuid(getuid());

-        if (pw && pw->pw_dir && *pw->pw_dir) {
+        if (pw && pw->pw_dir && *pw->pw_dir) { // getpwuid() returns NULL when the UID has no passwd entry
            return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
        }
 #endif
--- a/common/jinja/caps.cpp
+++ b/common/jinja/caps.cpp
@@ -1,3 +1,4 @@
+#include "log.h"
 #include "value.h"
 #include "runtime.h"
 #include "caps.h"
--- a/common/jinja/runtime.h
+++ b/common/jinja/runtime.h
@@ -106,16 +106,10 @@ struct statement {
    size_t pos; // position in source, for debugging
    virtual ~statement() = default;
    virtual std::string type() const { return "Statement"; }
-
    // execute_impl must be overridden by derived classes
-    virtual value execute_impl(context &) { throw_exec_error(); }
+    virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
    // execute is the public method to execute a statement with error handling
    value execute(context &);
-
-private:
-    [[noreturn]] void throw_exec_error() const {
-        throw std::runtime_error("cannot exec " + type());
-    }
 };

 // Type Checking Utilities
@@ -149,7 +143,7 @@ struct program : public statement {
    program() = default;
    explicit program(statements && body) : body(std::move(body)) {}
    std::string type() const override { return "Program"; }
-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
    }
 };
@@ -201,7 +195,7 @@ struct break_statement : public statement {
        }
    };

-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw break_statement::signal();
    }
 };
@@ -215,7 +209,7 @@ struct continue_statement : public statement {
        }
    };

-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw continue_statement::signal();
    }
 };
@@ -515,7 +509,7 @@ struct slice_expression : public expression {
        chk_type<expression>(this->step_expr);
    }
    std::string type() const override { return "SliceExpression"; }
-    [[noreturn]] value execute_impl(context &) override {
+    value execute_impl(context &) override {
        throw std::runtime_error("must be handled by MemberExpression");
    }
 };
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -590,10 +590,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
    return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
 }

-[[noreturn]] static value string_join_not_implemented(const func_args &) {
-    throw not_implemented_exception("String join builtin not implemented");
-}
-
 const func_builtins & value_string_t::get_builtins() const {
    static const func_builtins builtins = {
        {"default", default_value},
@@ -855,7 +851,9 @@ const func_builtins & value_string_t::get_builtins() const {
            res->val_str.mark_input_based_on(val_input->as_string());
            return res;
        }},
-        {"join", string_join_not_implemented},
+        {"join", [](const func_args &) -> value {
+            throw not_implemented_exception("String join builtin not implemented");
+        }},
    };
    return builtins;
 }
@@ -886,9 +884,6 @@ const func_builtins & value_bool_t::get_builtins() const {
    return builtins;
 }

-[[noreturn]] static value array_unique_not_implemented(const func_args &) {
-    throw not_implemented_exception("Array unique builtin not implemented");
-}

 const func_builtins & value_array_t::get_builtins() const {
    static const func_builtins builtins = {
@@ -1089,14 +1084,13 @@ const func_builtins & value_array_t::get_builtins() const {
            std::reverse(arr.begin(), arr.end());
            return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
        }},
-        {"unique", array_unique_not_implemented},
+        {"unique", [](const func_args &) -> value {
+            throw not_implemented_exception("Array unique builtin not implemented");
+        }},
    };
    return builtins;
 }

-[[noreturn]] static value object_join_not_implemented(const func_args &) {
-    throw not_implemented_exception("object join not implemented");
-}

 const func_builtins & value_object_t::get_builtins() const {
    if (!has_builtins) {
@@ -1189,7 +1183,9 @@ const func_builtins & value_object_t::get_builtins() const {
            });
            return result;
        }},
-        {"join", object_join_not_implemented},
+        {"join", [](const func_args &) -> value {
+            throw not_implemented_exception("object join not implemented");
+        }},
    };
    return builtins;
 }
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@@ -129,25 +129,27 @@ struct value_t {
    // Note: only for debugging and error reporting purposes
    virtual std::string type() const { return ""; }

-    virtual int64_t as_int() const { throw_type_error("is not an int value"); }
-    virtual double as_float() const { throw_type_error("is not a float value"); }
-    virtual string as_string() const { throw_type_error("is not a string value"); }
-    virtual bool as_bool() const { throw_type_error("is not a bool value"); }
-    virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
-    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
-    virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
+    virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
+    virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
+    virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
+    virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
+    virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
+    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
    virtual bool is_none() const { return false; }
    virtual bool is_undefined() const { return false; }
-    virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
+    virtual const func_builtins & get_builtins() const {
+        throw std::runtime_error("No builtins available for type " + type());
+    }

-    virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
-    virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
-    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
-    virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
-    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
-    virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
+    virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
+    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
+    virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }

    virtual bool is_numeric() const { return false; }
    virtual bool is_hashable() const { return false; }
@@ -161,11 +163,6 @@ struct value_t {
    // Note: only for debugging purposes
    virtual std::string as_repr() const { return as_string().str(); }

-private:
-    [[noreturn]] void throw_type_error(const char* expected) const {
-        throw std::runtime_error(type() + " " + expected);
-    }
-
 protected:
    virtual bool equivalent(const value_t &) const = 0;
    virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -49,7 +49,7 @@ enum common_log_col : int {
 };

 // disable colors by default
-static const char* g_col[] = {
+static std::vector<const char *> g_col = {
    "",
    "",
    "",
@@ -247,6 +247,7 @@ public:

            entries = std::move(new_entries);
        }
+
        cv.notify_one();
    }

@@ -264,6 +265,7 @@ public:
                {
                    std::unique_lock<std::mutex> lock(mtx);
                    cv.wait(lock, [this]() { return head != tail; });
+
                    cur = entries[head];

                    head = (head + 1) % entries.size();
@@ -299,6 +301,7 @@ public:

                tail = (tail + 1) % entries.size();
            }
+
            cv.notify_one();
        }

@@ -335,7 +338,7 @@ public:
            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
-            for (size_t i = 0; i < std::size(g_col); i++) {
+            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
            }
        }
@@ -365,20 +368,14 @@ struct common_log * common_log_init() {
 }

 struct common_log * common_log_main() {
-    // We intentionally leak (i.e. do not delete) the logger singleton because
-    // common_log destructor called at DLL teardown phase will cause hanging on Windows.
-    // OS will release resources anyway so it should not be a significant issue,
-    // though this design may cause logs to be lost if not flushed before the program exits.
-    // Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
-    static struct common_log * log;
+    static struct common_log log;
    static std::once_flag    init_flag;
    std::call_once(init_flag, [&]() {
-        log = new common_log;
        // Set default to auto-detect colors
-        log->set_colors(tty_can_use_colors());
+        log.set_colors(tty_can_use_colors());
    });

-    return log;
+    return &log;
 }

 void common_log_pause(struct common_log * log) {
--- a/common/log.h
+++ b/common/log.h
@@ -49,11 +49,7 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
 struct common_log;

 struct common_log * common_log_init();
-
-// Singleton, intentionally leaked to avoid Windows teardown hangs.
-// Call common_log_flush() before exit if you want to ensure all logs are flushed.
-struct common_log * common_log_main();
-
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
 void                common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
 void                common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
 void                common_log_free  (struct common_log * log);
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -43,7 +43,7 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
    for (const auto & it : key_to_opt) {
        const std::string & key = it.first;
        const common_arg & opt = it.second;
-        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
            allowed_keys.insert(key);
            // also add variant keys (args without leading dashes and env vars)
            for (const auto & arg : opt.get_args()) {
@@ -163,13 +163,8 @@ void common_preset::merge(const common_preset & other) {
    }
 }

-void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
+void common_preset::apply_to_params(common_params & params) const {
    for (const auto & [opt, val] : options) {
-        if (!handled_keys.empty()) {
-            if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
-                continue;
-            }
-        }
        // apply each option to params
        if (opt.handler_string) {
            opt.handler_string(params, val);
--- a/common/preset.h
+++ b/common/preset.h
@@ -43,8 +43,7 @@ struct common_preset {
    void merge(const common_preset & other);

    // apply preset options to common_params
-    // optionally specify handled_keys to only apply a subset of options (identified by their env), if empty, apply all options
-    void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
+    void apply_to_params(common_params & params) const;
 };

 // interface for multiple presets in one file
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -122,20 +122,6 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            }
            break;
        case REASONING_BUDGET_DONE:
-            // Re-arm on a new start tag: some models emit multiple <think> blocks
-            // per response, and each should get a fresh budget window.
-            if (ctx->start_matcher.advance(token)) {
-                ctx->state = REASONING_BUDGET_COUNTING;
-                ctx->remaining = ctx->budget;
-                ctx->end_matcher.reset();
-                LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
-
-                if (ctx->remaining <= 0) {
-                    ctx->state = REASONING_BUDGET_FORCING;
-                    ctx->force_pos = 0;
-                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
-                }
-            }
            break;
    }
 }
@@ -232,6 +218,34 @@ static struct llama_sampler * common_reasoning_budget_init_state(
    );
 }

+struct llama_sampler * common_reasoning_budget_init(
+        const struct llama_vocab       * vocab,
+        const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens,
+        const std::vector<llama_token> & forced_tokens,
+        int32_t                          budget,
+        const std::vector<llama_token> & prefill_tokens) {
+    // Determine the initial state from the prefill: COUNTING if the prefill begins with
+    // the start sequence and does not also end with the end sequence; IDLE otherwise.
+    common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
+    if (!prefill_tokens.empty() && !start_tokens.empty() &&
+            prefill_tokens.size() >= start_tokens.size() &&
+            std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
+        initial_state = REASONING_BUDGET_COUNTING;
+        // If the prefill also ends with the end sequence (located after the start
+        // sequence, without overlapping it), reasoning is already closed — revert to IDLE.
+        if (!end_tokens.empty() &&
+                prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
+            auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size(); // first of the last end_tokens.size() prefill tokens
+            if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
+                    std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
+                initial_state = REASONING_BUDGET_IDLE;
+            }
+        }
+    }
+    return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
+}
+
 struct llama_sampler * common_reasoning_budget_init(
        const struct llama_vocab       * vocab,
        const std::vector<llama_token> & start_tokens,
--- a/common/reasoning-budget.h
+++ b/common/reasoning-budget.h
@@ -29,7 +29,10 @@ enum common_reasoning_budget_state {
 //   end_tokens     - token sequence for natural deactivation
 //   forced_tokens  - token sequence forced when budget expires
 //   budget         - max tokens allowed in the reasoning block
-//   initial_state  - initial state
+//   prefill_tokens - tokens already present in the prompt (generation prompt);
+//                    used to determine the initial state: COUNTING if they begin
+//                    with start_tokens (but don't also end with end_tokens),
+//                    IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
 //
 struct llama_sampler * common_reasoning_budget_init(
        const struct llama_vocab       * vocab,
@@ -37,6 +40,16 @@ struct llama_sampler * common_reasoning_budget_init(
        const std::vector<llama_token> & end_tokens,
        const std::vector<llama_token> & forced_tokens,
        int32_t                          budget,
-        common_reasoning_budget_state    initial_state = REASONING_BUDGET_IDLE);
+        const std::vector<llama_token> & prefill_tokens = {});
+
+// Variant that takes an explicit initial state (used by tests and clone).
+// COUNTING with budget <= 0 is promoted to FORCING.
+struct llama_sampler * common_reasoning_budget_init(
+        const struct llama_vocab       * vocab,
+        const std::vector<llama_token> & start_tokens,
+        const std::vector<llama_token> & end_tokens,
+        const std::vector<llama_token> & forced_tokens,
+        int32_t                          budget,
+        common_reasoning_budget_state    initial_state);

 common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,12 +1,10 @@
 #include "sampling.h"

 #include "common.h"
-#include "fit.h"
+#include "ggml.h"
 #include "log.h"
 #include "reasoning-budget.h"

-#include "ggml.h"
-
 #include <algorithm>
 #include <cctype>
 #include <climits>
@@ -260,35 +258,32 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        }
    }

-    // Compute prefill tokens from the generation prompt
-    std::vector<llama_token> prefill_tokens;
-    if (!params.generation_prompt.empty()) {
-        GGML_ASSERT(vocab != nullptr);
-        auto tokens = common_tokenize(vocab, params.generation_prompt, false, true);
-        for (size_t i = 0; i < tokens.size(); i++) {
-            std::string piece = common_token_to_piece(vocab, tokens[i], true);
-            if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) {
-                // Some tokenizers will add a space before the first special token, need to exclude
-                continue;
-            }
-            LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str());
-            prefill_tokens.push_back(tokens[i]);
-        }
-    }
-
    // Feed generation prompt tokens to the grammar sampler so it advances past
    // tokens the template already placed in the prompt.
    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
-    if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) {
-        try {
-            for (const auto & token : prefill_tokens) {
-                llama_sampler_accept(grmr, token);
-                LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token);
+    std::vector<llama_token> prefill_tokens;
+    if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
+        GGML_ASSERT(vocab != nullptr);
+        prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
+        if (!prefill_tokens.empty()) {
+            std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
+            if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
+                // Some tokenizers will add a space before the first special token, need to remove
+                prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
+            }
+        }
+
+        if (grmr && !params.grammar_lazy) {
+            try {
+                for (const auto & token : prefill_tokens) {
+                    llama_sampler_accept(grmr, token);
+                    LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
+                }
+            } catch (std::exception &e) {
+                LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
+                    common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
+                throw e;
            }
-        } catch (std::exception &e) {
-            LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
-                common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
-            throw e;
        }
    }

@@ -299,12 +294,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
            params.reasoning_budget_start,
            params.reasoning_budget_end,
            params.reasoning_budget_forced,
-            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens);
-
-        for (const auto & token : prefill_tokens) {
-            llama_sampler_accept(rbudget, token);
-            LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token);
-        }
+            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
+            prefill_tokens);
    }

    if (params.has_logit_bias()) {
@@ -438,7 +429,7 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) {
    return true;
 }

-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (!gsmpl) {
        return;
    }
@@ -446,11 +437,9 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
    const auto tm = gsmpl->tm();

    // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
-    const auto accept_grammar = is_generated && grammar_should_apply(gsmpl);
+    accept_grammar = accept_grammar && grammar_should_apply(gsmpl);

-    if (gsmpl->rbudget && is_generated) {
-        llama_sampler_accept(gsmpl->rbudget, token);
-    }
+    llama_sampler_accept(gsmpl->rbudget, token);

    if (gsmpl->grmr && accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
@@ -522,7 +511,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %%      (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);

-        common_memory_breakdown_print(ctx);
+        llama_memory_breakdown_print(ctx);
    }
 }

@@ -547,8 +536,6 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    gsmpl->set_logits(ctx, idx);
-
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
@@ -560,17 +547,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
            GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
            GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");

-            for (size_t i = 0; i < cur_p.size; ++i) {
-                if (cur_p.data[i].id == id) {
-                    cur_p.selected = i;
-                    break;
-                }
-            }
+            // TODO: simplify
+            gsmpl->cur.resize(1);
+            gsmpl->cur[0] = { id, 0.0f, 1.0f };
+            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };

            return id;
        }
    }

+    gsmpl->set_logits(ctx, idx);
+
    // apply reasoning budget first
    llama_sampler_apply(rbudget, &cur_p);

--- a/common/sampling.h
+++ b/common/sampling.h
@@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st

 void common_sampler_free(struct common_sampler * gsmpl);

-// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler
-void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated);
+// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
 void                    common_sampler_reset (struct common_sampler * gsmpl);
 struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

--- a/common/speculative.cpp
+++ b/common/speculative.cpp
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -5,14 +5,8 @@

 struct common_speculative;

-// comma separated list the provided types
-std::string common_speculative_type_name_str(const std::vector<enum common_speculative_type> & types);
-
 // comma separated list of all types
-const char * common_speculative_all_types_str();
-
-// parse user provided types
-std::vector<enum common_speculative_type> common_speculative_types_from_names(const std::vector<std::string> & names);
+std::string common_speculative_type_name_str();

 // convert string to type
 enum common_speculative_type common_speculative_type_from_name(const std::string & name);
@@ -20,44 +14,24 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 // convert type to string
 std::string common_speculative_type_to_str(enum common_speculative_type type);

-common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq);
+common_speculative * common_speculative_init(
+        common_params_speculative & params,
+        llama_context             * ctx_tgt);

 void common_speculative_free(common_speculative * spec);

-struct common_speculative_draft_params {
-    // this flag is used to chain the drafts through all the available implementations
-    // after the first successful draft from an implementation, we set it
-    //   to false to prevent further drafts for that sequence
-    // at the end of the draft() call, all drafting flags will be reset to false
-    bool drafting = false;
-
-    // overrides individual configurations (-1 disabled)
-    // can be used to constraint the max draft based on the remaining context size
-    int32_t n_max = -1;
-
-    llama_pos   n_past;
-    llama_token id_last;
-
-    // TODO: remove in the future by keeping track of the prompt from the _begin() call and the consecutive accept calls
-    const llama_tokens * prompt;
-
-    // the generated draft from the last _draft() call
-    llama_tokens * result;
-};
-
-common_speculative_draft_params & common_speculative_get_draft_params(common_speculative * spec, llama_seq_id seq_id);
-
 // optionally call once at the beginning of a new generation
-void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, const llama_tokens & prompt);
+void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);

-// process the batch and update the internal state of the speculative context
-bool common_speculative_process(common_speculative * spec, const llama_batch & batch);
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_draft(
+                     common_speculative * spec,
+        const common_params_speculative & params,
+                     const llama_tokens & prompt,
+                            llama_token   id_last);

-// generate drafts for the sequences specified with `common_speculative_get_draft_params`
-void common_speculative_draft(common_speculative * spec);
-
-// informs the speculative context that n_accepted tokens were accepted by the target model
-void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
+// informs the speculative decoder that n_accepted tokens were accepted by the target model
+void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);

 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -155,7 +155,6 @@ models = [
    {"name": "joyai-llm",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
    {"name": "kanana2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
    {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
-    {"name": "sarvam-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -176,7 +175,6 @@ pre_computed_hashes = [
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
    {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
-    {"name": "qwen35",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM-V-4_6", "chkhsh": "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f"},
    {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
    # jina-v2-de variants
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -188,24 +188,6 @@ class LoraTorchTensor:
    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
        return self.transpose(axis0, axis1)

-    def split(self, split_size: int | Sequence[int], dim: int = 0) -> tuple[LoraTorchTensor, ...]:
-        shape = self.shape
-        ndim = len(shape)
-        if dim < 0:
-            dim += ndim
-        if dim == ndim - 1:
-            A_chunks = self._lora_A.split(split_size, dim=-1)
-            return tuple(LoraTorchTensor(a, self._lora_B) for a in A_chunks)
-        elif dim == ndim - 2:
-            B_chunks = self._lora_B.split(split_size, dim=-2)
-            return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
-        else:
-            B_chunks = self._lora_B.split(split_size, dim=dim)
-            if self._lora_A.shape[dim] == 1:
-                return tuple(LoraTorchTensor(self._lora_A, b) for b in B_chunks)
-            A_chunks = self._lora_A.split(split_size, dim=dim)
-            return tuple(LoraTorchTensor(a, b) for a, b in zip(A_chunks, B_chunks))
-
    def to(self, *args, **kwargs):
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

@@ -248,11 +230,6 @@ class LoraTorchTensor:
                )
            else:
                raise NotImplementedError
-        elif func is torch.split:
-            assert len(args) and len(args) >= 2
-            tensor, split_size = args[0], args[1]
-            dim = args[2] if len(args) > 2 else kwargs.get("dim", 0)
-            return tensor.split(split_size, dim=dim)
        else:
            raise NotImplementedError

--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -244,6 +244,7 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
 - `-fa 1` is required when running llama-bench with the OpenVINO backend.
  - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
 - `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
+- For Intel GPU and NPU detection in containers, the GPU and NPU user-space drivers/libraries must be present inside the image. They will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)

 > [!NOTE]
 > The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -273,6 +274,8 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
 Run llama.cpp with OpenVINO backend Docker container.
 Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.

+> [!NOTE]
+> Intel GPU and NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).

 ```bash
 #  Run Docker container
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -31,8 +31,6 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.

 ## Recommended Release

-### Windows
-
 The following releases are verified and recommended:

 |Commit ID|Tag|Release|Verified  Platform| Update date|
@@ -41,22 +39,9 @@ The following releases are verified and recommended:
 |3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
 |fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||

-### Ubuntu 24.04
-
-The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have pre-installed Intel GPU drivers and oneAPI packages that are the same version as the build package. To get the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
-
-It is recommended to use them with Intel Docker.
-
-The packages for FP32 and FP16 would have different accuracy and performance on LLMs. Please choose it acording to the test result.

 ## News

- 2026.04
-
-  - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
-  - Fused MoE.
-  - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.
-
 - 2026.03
  - Support Flash-Attention: less memory usage, performance impact depends on LLM.

@@ -244,7 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 |Verified release|
 |-|
-|2025.3.3 |
 |2025.2.1|
 |2025.1|
 |2024.1|
@@ -355,12 +339,6 @@ Choose one of following methods to run.
 ./examples/sycl/test.sh
 ```

- Run llama-server:
-
-```sh
-./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
-```
-
 2. Command line
 Launch inference

@@ -649,18 +627,10 @@ Choose one of following methods to run.

 1. Script

- Run test:
-
 ```
 examples\sycl\win-test.bat
 ```

- Run llama-server:
-
-```
-examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
-```
-
 2. Command line

 Launch inference
@@ -737,14 +707,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|

-## Compile-time Flags
-
-Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.
-
-| Name            | Function                                                                         |
-|-----------------|----------------------------------------------------------------------------------|
-| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
-
 ## Design Rule

 - Open to all contributors.
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -249,27 +249,18 @@ build: 6a8cf8914 (6733)
  ```

 - `GGML_HEXAGON_PROFILE=1`
-  Enables Op profiling:
+  Generates a host-side profile for the ggml-hexagon Ops.

-  - `1` Basic profile with per-op `usecs` and `cycles` counters
-  - `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
-  - `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
-
-  The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
-  Examples:
-
-      `GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
-
- `GGML_HEXAGON_OPSTAGE=0x0`
-  Allows enabling specific stages of the Op processing pipeline:
+- `GGML_HEXAGON_OPMASK=0x0`
+  Allows enabling specific stages of the processing pipeline:

  - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
  - `0x2` Enable Op Compute (MUL_MAT, etc.)

  Examples:

-      `GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
-      `GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
+      `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
+      `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)

 - `GGML_HEXAGON_OPFILTER=regex`
  Allows filtering (disabling) Ops that match the regex pattern:
--- a/docs/multi-gpu.md
+++ b/docs/multi-gpu.md
@@ -1,127 +0,0 @@
-# Using multiple GPUs with llama.cpp
-
-This guide explains how to run [llama.cpp](https://github.com/ggml-org/llama.cpp) across more than one GPU. It covers the split modes, the command-line flags that control them, the limitations you need to know about, and ready-to-use recipes for `llama-cli` and `llama-server`.
-
-The CLI arguments listed here are the same for both tools - or most llama.cpp binaries for that matter.
-
---
-
-## When you need multi-GPU
-
-Reach for multi-GPU when one of these is true:
-
- **The model doesn't fit in a single GPU's VRAM.** By spreading the weights across two or more GPUs the whole model can stay on accelerators. Otherwise part of the model will need to be run off of the comparatively slower system RAM.
- **You want more throughput.** By distributing the computation across multiple GPUs, each individual GPU has to do less work. This can result in better prefill and/or token generation performance, depending on the split mode and interconnect speed vs. the speed of an individual GPU.
-
---
-
-## The split modes
-
-Set with `--split-mode` / `-sm`.
-
-| Mode | What it does | When to use |
-|---|---|---|
-| `none` | Use a single GPU only. Pick which one with `--main-gpu`. | You explicitly want to confine the model to one GPU even though more are visible. |
-| `layer` (**default**) | Pipeline parallelism. Each GPU holds a contiguous slice of layers. The KV cache for layer *l* lives on the GPU that owns layer *l*. | Default and most compatible multi-GPU choice. You want more memory than a single GPU provides and your priority is a fast prefill. Can tolerate slow interconnect speeds between GPUs. |
-| `row` | **Deprecated.** Older row-split tensor-parallel path with comparatively poor performance. Splits only dense weights across GPUs. Superseded by `tensor` which should be universally superior if it can be used. | Avoid in new deployments. |
-| `tensor` | **EXPERIMENTAL.** Tensor parallelism that splits both weights *and* KV across the participating GPUs via a "meta device" abstraction. | You want more memory than a single GPU provides and your priority is fast token generation. Prefill speeds approach pipeline parallel speeds for large, dense models and fast GPU interconnect speeds. Treat as experimental as the code is less mature than pipeline parallelism. Performance should be good for multiple NVIDIA GPUs using the CUDA backend, no guarantees otherwise. |
-
-> Pipeline parallel (`layer`) vs. tensor parallel (`tensor`): pipeline-parallel runs different layers on different GPUs and processes tokens sequentially through the pipeline. This minimizes data transfers between GPUs but requires many tokens to scale well. Tensor-parallel splits each layer across GPUs and does multiple cross-GPU reductions per layer. This enables parallelizing any workload but is much more bottlenecked by the GPU interconnect speed. Pipeline-parallel maximizes batch throughput; tensor-parallel minimizes latency.
-
---
-
-## Command-line arguments reference
-
-| Short | Long | Value | Default | Notes |
-|---|---|---|---|---|
-| `-sm` | `--split-mode` | `none` \| `layer` \| `tensor` | `layer` | See modes above. |
-| `-ts` | `--tensor-split` | comma-separated proportions, e.g. `3,1` | mode-dependent | How much of the model goes to each GPU. If omitted, `layer`/`row` use automatic splitting proportional to memory, while `tensor` splits tensor segments evenly. With `3,1` on two GPUs, GPU 0 gets 75 %, GPU 1 gets 25 %. The values follow the order in `--device`. |
-| `-mg` | `--main-gpu` | integer device index | `0` | The single GPU used in `--split-mode none`. |
-| `-ngl` | `--n-gpu-layers` / `--gpu-layers` | integer \| `auto` \| `all` | `auto` | Maximum number of layers to keep in VRAM. Use `999` or `all` to push everything possible to the GPUs. |
-| `-dev` | `--device` | comma-separated device names, or `none` | auto | Restrict which devices llama.cpp may use. See `--list-devices` for names. |
-| | `--list-devices` | - | - | Print the available devices and their memory. Run this first to learn the names you'd pass to `--device`. |
-| `-fa` | `--flash-attn` | `on` \| `off` \| `auto` | `auto` | Required when using `--split-mode tensor` and/or quantized V cache. Supported (and therefore enabled by default) for most combinations of models and backends. |
-| `-ctk` | `--cache-type-k` | `f32` \| `f16` \| `bf16` \| `q8_0` \| `q4_0` \| ... | `f16` | KV cache type for K. |
-| `-ctv` | `--cache-type-v` | same as `-ctk` | `f16` | KV cache type for V. |
-| `-fit` | `--fit` | `on` \| `off` | `on` | Auto-fit unset args to device memory. **Not supported with `tensor`. You may need to manually set the `--ctx-size` to make the model fit.**  |
-
-As for any CUDA program, the environment variable `CUDA_VISIBLE_DEVICES` can be used to control which GPUs to use for the CUDA backend: if you set it, llama.cpp only sees the specified GPUs. Use `--device` for selecting GPUs from among those visible to llama.cpp, this works for any backend.
-
---
-
-## Recipes
-
-### 1. Default - pipeline parallel across all visible GPUs
-
-```bash
-llama-cli -m model.gguf
-llama-server -m model.gguf
-```
-
-Easiest configuration. KV cache spreads across the GPUs along with the layers. `--fit` (on by default) sizes things automatically.
-
-### 2. Pipeline parallel with a custom split ratio
-
-```bash
-llama-cli -m model.gguf -ts 3,1
-```
-
-Useful when GPUs have different memory: GPU 0 (3 parts) and GPU 1 (1 part). Proportions are normalized so `-ts 3,1` is the same as e.g. `-ts 75,25`.
-
-### 3. Single-GPU mode, picking a specific GPU
-
-```bash
-llama-cli --list-devices
-llama-cli -m model.gguf -dev CUDA1
-```
-
-Use only the device listed as `CUDA1` when calling with `--list-devices`.
-
-### 4. Tensor parallelism (experimental)
-
-```bash
-llama-cli -m model.gguf -sm tensor -ctk f16 -ctv f16
-```
-
- `--flash-attn off` or (`--flash-attn auto` resolving to `off` when it isn't supported) is a hard error.
- KV cache types must be non-quantized: `f32`, `f16`, or `bf16`. Support for quantized KV cache is not implemented and trying to use it will result in an error.
- Mark this configuration as experimental in your tooling: validate output quality before deploying.
- `--split-mode tensor`is not implemented for all architectures. The following will fail with *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '...'"*:
-
-  - **MoE / hybrid:** Grok, MPT, OLMoE, DeepSeek2, GLM-DSA, Nemotron-H, Nemotron-H-MoE, Granite-Hybrid, LFM2-MoE, Minimax-M2, Mistral4, Kimi-Linear, Jamba, Falcon-H1
-  - **State-space / RWKV-style:** Mamba, Mamba2 (and the hybrid Mamba-attention models above)
-  - **Other:** PLAMO2, MiniCPM3, Gemma-3n, OLMo2, BitNet, T5
-
-### 5. With NCCL
-
-There's no runtime flag for NCCL - it's selected at build time (`-DGGML_CUDA_NCCL=ON`, this is the default). Note that NCCL is **not** automatically distributed with CUDA and you may need to install it manually - when in doubt check the CMake log to see whether or not it can find the package. When llama.cpp is compiled with NCCL support it uses it automatically for cross-GPU reductions in `tensor` mode. When NCCL is missing on a multi-GPU build, you'll see this one-time warning and performance will be lower:
-
-```
-NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal
-```
-
-When using the "ROCm" backend (which is the ggml CUDA code translated for AMD via HIP), the AMD equivalent RCCL can be used by compiling with `-DGGML_HIP_RCCL=ON`. Note that RCCL is by default *disabled* because (unlike NCCL) it was not universally beneficial during testing.
-### 6. With CUDA peer-to-peer access (`GGML_CUDA_P2P`)
-
-CUDA peer-to-peer (P2P) lets GPUs transfer data directly between each other instead of going through system memory, which generally improves multi-GPU performance. It is **opt-in** at runtime - set the environment variable `GGML_CUDA_P2P` to any value to enable it:
-
-```bash
-GGML_CUDA_P2P=1 llama-cli -m model.gguf -sm tensor
-```
-
-P2P requires driver support (usually restricted to workstation/datacenter GPUs) and **may cause crashes or corrupted outputs on some motherboards or BIOS configurations** (e.g. when IOMMU is enabled). If you see instability after enabling it, unset the variable.
-
---
-
-## Troubleshooting
-
-| Symptom | How to fix |
-|---|---|
-| Startup error *"SPLIT_MODE_TENSOR requires flash_attn to be enabled"* | Add `-fa on` or remove `-fa off`. |
-| Startup error *"simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented"* | Use `-ctk f16 -ctv f16` (or `bf16`/`f32`) with `--split-mode tensor`. |
-| Startup error *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture 'X'"* | Architecture not on the TENSOR allow-list. Use `--split-mode layer`. |
-| Warning *"NCCL is unavailable, multi GPU performance will be suboptimal"* | llama.cpp wasn't built with NCCL. Either accept the lower performance or install NCCL and rebuild. |
-| CUDA OOM at startup or during prefill in `--split-mode tensor` | Auto-fit is disabled in this mode, so reduce memory pressure yourself. In order from least to most disruptive: lower `--ctx-size` (`-c`) (KV cache is roughly proportional to `n_ctx`); for `llama-server`, lower `--parallel` (`-np`) (a slot KV cache is allocated per concurrent sequence); as a last resort, reduce `--n-gpu-layers` (`-ngl`) (the remaining layers run on CPU and inference will be much slower). |
-| Performance is worse with multi-GPU than single-GPU | The performance is bottlenecked by GPU interconnect speed. For `--split-mode tensor`, verify that NCCL is being used. Try `--split-mode layer` (less communication than `tensor`). Increase GPU interconnect speed via more PCIe lanes or e.g. NVLink (if available). |
-| GPU not used at all | `--n-gpu-layers` is `0` or too low - try explicitly setting `-ngl all`. Or you are accidentally hiding the GPUs via an environment variable like `CUDA_VISIBLE_DEVICES=-1`. Or your build doesn't include support for the relevant backend. |
-| Crashes or corrupted outputs after setting `GGML_CUDA_P2P=1` | Some motherboards and BIOS settings (e.g. with IOMMU enabled) don't support CUDA peer-to-peer reliably. Unset `GGML_CUDA_P2P`. |
--- a/docs/multimodal/minicpmv4.6.md
+++ b/docs/multimodal/minicpmv4.6.md
@@ -1,49 +0,0 @@
-## MiniCPM-V 4.6
-
-### Prepare models and code
-
-Download [MiniCPM-V-4_6](https://huggingface.co/openbmb/MiniCPM-V-4_6) PyTorch model from huggingface to "MiniCPM-V-4_6" folder.
-
-The model must be the standard `transformers` v5.7.0+ checkpoint (no `trust_remote_code`); the architecture in `config.json` is `MiniCPMV4_6ForConditionalGeneration` with a `qwen3_5_text` text model and a SigLIP-based vision tower plus a window-attention `vit_merger`.
-
-### Build llama.cpp
-
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
-
-Clone llama.cpp:
-```bash
-git clone https://github.com/ggml-org/llama.cpp
-cd llama.cpp
-```
-
-Build llama.cpp using `CMake`:
-```bash
-cmake -B build
-cmake --build build --config Release
-```
-
-
-### Usage of MiniCPM-V 4.6
-
-Unlike older MiniCPM-V variants, MiniCPM-V 4.6 is converted directly through `convert_hf_to_gguf.py`. The same script is invoked twice on the original Hugging Face directory: once to produce the language-model GGUF and once with `--mmproj` to produce the multimodal projector GGUF.
-
-```bash
-# language model
-python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --outfile ../MiniCPM-V-4_6/ggml-model-f16.gguf
-
-# multimodal projector (vision tower + window-attention vit_merger + DownsampleMLP merger)
-python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --mmproj --outfile ../MiniCPM-V-4_6/mmproj-model-f16.gguf
-
-# optional: quantize to Q4_K_M
-./build/bin/llama-quantize ../MiniCPM-V-4_6/ggml-model-f16.gguf ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf Q4_K_M
-```
-
-
-Inference on Linux or Mac
-```bash
-# run in single-turn mode
-./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
-
-# run in conversation mode
-./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf
-```
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -17,8 +17,8 @@ Legend:
 |                              ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
@@ -26,7 +26,7 @@ Legend:
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -36,15 +36,15 @@ Legend:
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                             DIAG | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                             DIAG | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
@@ -60,18 +60,17 @@ Legend:
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 |                              LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
@@ -102,11 +101,11 @@ Legend:
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
 |                              SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
@@ -118,5 +117,5 @@ Legend:
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
 This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.

 ```
-llama-server [...] --spec-type ngram-simple --spec-draft-n-max 64
+llama-server [...] --spec-type ngram-simple --draft-max 64
 ```

 #### n-gram Map Key (`ngram-map-k`)

-This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-map-k-min-hits`, default is 1) before generating drafts.
+This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.

 The number of accepted tokens is stored for each used n-gram.

 **Example:**
 ```
-llama-server [...] --spec-type ngram-map-k --spec-draft-n-max 64
+llama-server [...] --spec-type ngram-map-k --draft-max 64
 ```

 #### n-gram Map Key-4-Values (`ngram-map-k4v`)
@@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.

 **Example:** Server options to be used if there are a lot of longer repetitions.
 ```
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-map-k4v-size-n 8 --spec-ngram-map-k4v-size-m 8 --spec-ngram-map-k4v-min-hits 2 --spec-draft-n-max 64
+llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
 ```

 ### n-gram Mod (`ngram-mod`)
@@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
 # notes:
 # - small `n` are not recommended
 # - MoEs require long drafts
-# - dense models: can reduce `--spec-ngram-mod-n-min` and `--spec-ngram-mod-n-max`
+# - dense models: can reduce `--draft-min` and `--draft-max`

-llama-server ... --spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
+llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
 ```

 Applications:
@@ -105,90 +105,21 @@ Example Video:

 If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.

-### General Speculative Parameters
-
 ```
+--draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
+                                        (env: LLAMA_ARG_DRAFT_MAX)
+--draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
+                                        (default: 0)
+                                        (env: LLAMA_ARG_DRAFT_MIN)
+[...]
 --spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                        type of speculative decoding to use when no draft model is provided
                                        (default: none)
-                                        (env: LLAMA_ARG_SPEC_TYPE)
--spec-default                          use default speculative decoding
-```
-
-### Draft Model Parameters
-
-```
--spec-draft-model, -md, --model-draft  FNAME
-                                        draft model for speculative decoding (default: unused)
-                                        (env: LLAMA_ARG_SPEC_DRAFT_MODEL)
--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft  <user>/<model>[:quant]
-                                        HuggingFace repository for the draft model
--spec-draft-n-max                      N
-                                        number of tokens to draft for speculative decoding (default: 16)
-                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
--spec-draft-n-min                      N
-                                        minimum number of draft tokens to use for speculative decoding (default: 0)
-                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MIN)
--spec-draft-p-split, --draft-p-split   P
-                                        speculative decoding split probability (default: 0.10)
-                                        (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
--spec-draft-p-min, --draft-p-min       P
-                                        minimum speculative decoding probability (greedy) (default: 0.75)
-                                        (env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
--spec-draft-ctx-size, -cd, --ctx-size-draft  N
-                                        size of the prompt context for the draft model (default: 0, 0 = loaded from model)
-                                        (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft  N
-                                        max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
-                                        (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
--spec-draft-device, -devd, --device-draft  <dev1,dev2,..>
-                                        comma-separated list of devices to use for offloading the draft model
--spec-draft-replace, --spec-replace    TARGET  DRAFT
-                                        translate the string in TARGET into DRAFT if the draft model and main model are not compatible
-```
-
-### n-gram Mod Parameters
-
-```
--spec-ngram-mod-n-match                N
-                                        ngram-mod lookup length (default: 24)
--spec-ngram-mod-n-min                  N
-                                        minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48)
--spec-ngram-mod-n-max                  N
-                                        maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64)
-```
-
-### n-gram Simple Parameters
-
-```
--spec-ngram-simple-size-n              N
-                                        ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12)
--spec-ngram-simple-size-m              N
-                                        ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48)
--spec-ngram-simple-min-hits            N
-                                        minimum hits for ngram-simple speculative decoding (default: 1)
-```
-
-### n-gram Map Key Parameters
-
-```
--spec-ngram-map-k-size-n               N
-                                        ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12)
--spec-ngram-map-k-size-m               N
-                                        ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48)
--spec-ngram-map-k-min-hits             N
-                                        minimum hits for ngram-map-k speculative decoding (default: 1)
-```
-
-### n-gram Map Key-4-Values Parameters
-
-```
--spec-ngram-map-k4v-size-n             N
-                                        ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12)
--spec-ngram-map-k4v-size-m             N
-                                        ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48)
--spec-ngram-map-k4v-min-hits           N
-                                        minimum hits for ngram-map-k4v speculative decoding (default: 1)
+--spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
+                                        of lookup n-gram (default: 12)
+--spec-ngram-size-m N                   ngram size M for ngram-simple/ngram-map speculative decoding, length
+                                        of draft m-gram (default: 48)
+--spec-ngram-min-hits N                 minimum hits for ngram-map speculative decoding (default: 1)
 ```

 ### `--spec-type TYPE`
@@ -209,40 +140,21 @@ Specifies a type of speculative decoding without draft model.
 ./llama-server [...] --spec-type ngram-simple
 ```

-### `--spec-ngram-*-size-n N`
+### `--spec-ngram-size-n N`

 Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
 The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.

-Each n-gram implementation has its own parameter:
-
- `--spec-ngram-simple-size-n` for `ngram-simple`
- `--spec-ngram-map-k-size-n` for `ngram-map-k`
- `--spec-ngram-map-k4v-size-n` for `ngram-map-k4v`
- `--spec-ngram-mod-n-match` for `ngram-mod`
-
-### `--spec-ngram-*-size-m M`
+### `--spec-ngram-size-m M`

 Sets the size M of the draft m-gram for n-gram map based speculative decoding.
 The m-gram size determines how many tokens to draft when a match is found.
 Larger values can provide more speedup but may reduce acceptance rate.

-Each n-gram implementation has its own parameter:
-
- `--spec-ngram-simple-size-m` for `ngram-simple`
- `--spec-ngram-map-k-size-m` for `ngram-map-k`
- `--spec-ngram-map-k4v-size-m` for `ngram-map-k4v`
-
-### `--spec-ngram-*-min-hits H`
+### `--spec-ngram-min-hits H`

 This option defines how often a key has to appear in the token history to be used as a draft (default is 1).

-Each n-gram implementation has its own parameter:
-
- `--spec-ngram-simple-min-hits` for `ngram-simple`
- `--spec-ngram-map-k-min-hits` for `ngram-map-k`
- `--spec-ngram-map-k4v-min-hits` for `ngram-map-k4v`
-
 ## Statistics
 Each speculative decoding implementation prints statistics.

@@ -268,3 +180,4 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
 - `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
+
--- a/examples/debug/debug.cpp
+++ b/examples/debug/debug.cpp
@@ -202,14 +202,10 @@ static bool run(llama_context * ctx, const common_params & params) {
    print_tokenized_prompt(ctx, tokens, params.prompt);

    if (params.save_logits) {
-        try {
-            output_data output {ctx, model, params};
-            std::filesystem::path model_path{params.model.path};
-            std::string model_name{model_path.stem().string()};
-            save_output_data(output, model_name, params.logits_output_dir);
-        } catch (const std::exception & e) {
-            LOG_ERR("%s : error saving logits: %s\n", __func__, e.what());
-        }
+        output_data output {ctx, model, params};
+        std::filesystem::path model_path{params.model.path};
+        std::string model_name{model_path.stem().string()};
+        save_output_data(output, model_name, params.logits_output_dir);
    }

    return true;
@@ -227,7 +223,7 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    std::optional<common_debug_cb_user_data> cb_data;
+    std::optional<base_callback_data> cb_data;
    if (!params.save_logits) {
        cb_data.emplace(params, params.tensor_filter);
    }
--- a/examples/diffusion/CMakeLists.txt
+++ b/examples/diffusion/CMakeLists.txt
@@ -1,10 +1,5 @@
-set(TARGET llama-diffusion)
-add_library(${TARGET} STATIC diffusion.cpp diffusion.h)
-target_link_libraries(${TARGET} PUBLIC llama llama-common ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PUBLIC cxx_std_17)
-
 set(TARGET llama-diffusion-cli)
 add_executable(${TARGET} diffusion-cli.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama-diffusion llama llama-common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -12,11 +12,11 @@ The diffusion CLI supports various parameters to control the generation process:
 ### Core Diffusion Parameters
 - `--diffusion-steps`: Number of diffusion steps (default: 256)
 - `--diffusion-algorithm`: Algorithm for token selection
-  - `0`: DIFFUSION_ALGORITHM_ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
-  - `1`: DIFFUSION_ALGORITHM_ENTROPY_BASED - Entropy-based selection
-  - `2`: DIFFUSION_ALGORITHM_MARGIN_BASED - Margin-based selection
-  - `3`: DIFFUSION_ALGORITHM_RANDOM - Random selection
-  - `4`: DIFFUSION_ALGORITHM_CONFIDENCE_BASED - Confidence-based selection (default)
+  - `0`: ORIGIN - Tokens are generated in a purely random order, as described in https://arxiv.org/abs/2107.03006.
+  - `1`: ENTROPY_BASED - Entropy-based selection
+  - `2`: MARGIN_BASED - Margin-based selection
+  - `3`: RANDOM - Random selection
+  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
  - More documentation here https://github.com/DreamLM/Dream
 - `--diffusion-visual`: Enable live visualization during generation

--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -1,23 +1,127 @@
 #include "arg.h"
 #include "chat.h"
 #include "common.h"
-#include "diffusion.h"
 #include "llama.h"
 #include "log.h"

 #include <limits.h>

+#include <algorithm>
 #include <clocale>
+#include <cmath>
 #include <cstring>
+#include <limits>
+#include <random>
 #include <string>
 #include <vector>

+enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };  // values mirror the --diffusion-algorithm codes listed in the README
+
+// Selects how calculate_transfer_count() derives the per-step transfer count from the remaining masked tokens
+enum transfer_schedule {
+    TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
+    BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
+};
+
+typedef bool (*diffusion_step_callback_t)(int32_t             step,
+                                          int32_t             total_steps,
+                                          const llama_token * tokens,
+                                          int32_t             n_tokens,
+                                          void *              user_data);  // hook type stored in diffusion_params::step_callback; NOTE(review): meaning of the bool result is not visible here (presumably false = stop) - confirm
+
+struct diffusion_params {  // Settings passed to diffusion_generate()
+    int32_t                   steps                   = 0;
+    float                     temperature             = 0;
+    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;  // Token used to pad the output past the prompt
+    diffusion_step_callback_t step_callback           = nullptr;
+    void *                    step_callback_user_data = nullptr;
+    int32_t                   seed                    = 0;      // Seeds the std::mt19937 RNG used during generation
+    bool                      visual_mode             = false;
+    bool                      shift_logits            = false;  // Shift logits by -1 after decode
+
+    float   top_p = 0.;
+    int32_t top_k = 0;
+
+    diffusion_algorithm algorithm = CONFIDENCE_BASED;
+    transfer_schedule   schedule  = TIMESTEP_BASED;
+
+    float   cfg_scale        = 0.;     // Classifier-free guidance (CFG) scale
+    float   eps              = 0.;     // Epsilon for timestep scheduling
+    int32_t block_length     = 0;      // Block size (for block scheduling)
+    float   alg_temp         = 0;      // algorithm temperature (0.0 = deterministic)
+    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0
+
+    int32_t max_length = 0;            // Maximum sequence length (prompt + generated); must exceed n_input
+};
+
 struct callback_data {
    diffusion_params *  diff_params;
    const llama_vocab * vocab;
    int32_t             n_input;
 };

+static float calculate_confidence(const llama_token_data_array & cur_p,
+                                  diffusion_algorithm            algorithm,
+                                  std::mt19937 &                 rng) {
+    switch (algorithm) {
+        case CONFIDENCE_BASED:
+            return cur_p.data[cur_p.selected].p;  // Selected token probability
+
+        case ENTROPY_BASED:
+            {
+                float       entropy = 0.0f;
+                const float epsilon = 1e-10f;
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    float prob = cur_p.data[i].p;
+                    entropy += prob * logf(prob + epsilon);  // accumulates sum(p * log p); epsilon keeps logf away from 0
+                }
+                return -entropy;  // NOTE(review): accumulator is sum(p*log p), so this returns +H and grows with uncertainty - confirm callers rank it as intended
+            }
+
+        case MARGIN_BASED:
+            return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;  // NOTE(review): assumes cur_p is non-empty and sorted by p, descending - confirm
+
+        case RANDOM:
+            {
+                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
+                return uniform(rng);  // Random confidence
+            }
+
+        case ORIGIN:
+            return cur_p.data[cur_p.selected].p;  // same score as CONFIDENCE_BASED
+
+        default:
+            return 0.0f;
+    }
+}
+
+// Returns how many of the remaining masked tokens to transfer at `step` under the given schedule
+static int32_t calculate_transfer_count(int32_t                      step,
+                                        int32_t                      total_steps,
+                                        int32_t                      remaining_masked,
+                                        transfer_schedule            schedule,
+                                        float                        eps,
+                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
+    switch (schedule) {
+        case TIMESTEP_BASED:
+            {
+                float t          = 1.0f - (float) step / total_steps * (1.0f - eps);        // 1 at step 0, eps at step == total_steps
+                float s          = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);  // same line evaluated one step later
+                float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;         // last step transfers everything left
+                return (int32_t) (remaining_masked * p_transfer);                             // cast truncates toward zero
+            }
+
+        case BLOCK_BASED:
+            if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
+                return num_transfer_tokens[step];
+            }
+            return remaining_masked / (total_steps - step);  // Fallback: even split over the steps left; NOTE(review): zero divisor if step == total_steps
+
+        default:
+            return remaining_masked / (total_steps - step);  // same even split as the BLOCK_BASED fallback
+    }
+}
+
 static bool diffusion_step_callback(int32_t             step,
                                    int32_t             total_steps,
                                    const llama_token * tokens,
@@ -72,6 +176,341 @@ static bool diffusion_step_callback(int32_t             step,
    return true;
 }

+// Rewrites the first n_vocab entries of `logits` in place as exp(logit) / (-log(u))^temperature, with u drawn from rng.
+static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
+    // A temperature of exactly zero leaves the logits untouched.
+    if (temperature != 0.0f) {
+        std::uniform_real_distribution<double> unit_draw(0.0, 1.0);
+
+        for (int32_t idx = 0; idx < n_vocab; ++idx) {
+            // Clamp the draw away from zero so that log() stays finite.
+            const double u       = std::max(unit_draw(rng), 1e-20);
+            const double divisor = std::pow(-std::log(u), temperature);
+            logits[idx]          = std::exp(logits[idx]) / divisor;
+        }
+    }
+}
+
+// Splits mask_count into `steps` near-equal per-step quotas; returns an empty vector when steps <= 0.
+static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
+    if (steps <= 0) {
+        return {};  // guard: the division below would be by zero, and a vector cannot have a negative size
+    }
+    // Every step starts from the integer quotient; the first (mask_count % steps) steps take one extra token.
+    std::vector<int32_t> num_transfer_tokens(steps, mask_count / steps);
+    for (int32_t i = 0; i < mask_count % steps; i++) {
+        num_transfer_tokens[i]++;
+    }
+    return num_transfer_tokens;
+}
+
+static void diffusion_generate(llama_context *          ctx,
+                               const llama_token *      input_tokens,
+                               llama_token *            output_tokens,
+                               int32_t                  n_input,
+                               const diffusion_params & params,
+                               int32_t &                n_generated) {
+    // Fills output_tokens[0, params.max_length) with the prompt followed by mask tokens, then runs up to
+    // params.steps denoising steps that replace masks with sampled tokens. n_generated stays 0 on rejected arguments.
+    n_generated = 0;
+    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
+        return;
+    }
+
+    const llama_model * model = llama_get_model(ctx);
+
+    // Initialize with input and pad with mask tokens
+    std::copy(input_tokens, input_tokens + n_input, output_tokens);
+    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
+
+    std::mt19937 rng(params.seed);
+
+    llama_set_causal_attn(ctx, false);
+
+    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+
+    std::vector<llama_token_data> candidates(n_vocab);
+    std::vector<llama_token_data> conf_candidates;
+    conf_candidates.reserve(params.max_length);
+    std::vector<int32_t> mask_positions;
+    mask_positions.reserve(params.max_length);
+
+    // Setup sampler chain
+    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    if (params.top_k > 0) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
+    }
+    if (params.top_p < 1.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
+    }
+    if (params.temperature > 0.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
+    }
+    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
+
+    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
+
+    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
+    batch.n_tokens    = params.max_length;
+
+    // Pre-allocate buffers for CFG if needed (size_t: n_vocab * max_length can exceed INT32_MAX)
+    size_t                   logits_size = (size_t) n_vocab * params.max_length;
+    std::vector<float>       cond_logits_buffer;
+    std::vector<llama_token> un_x_buffer;
+    if (params.cfg_scale > 0.0f) {
+        cond_logits_buffer.resize(logits_size);
+        un_x_buffer.resize(params.max_length);
+    }
+
+    // For block-based processing
+    std::vector<int32_t> num_transfer_tokens;
+    int32_t              num_blocks      = 1;
+    int32_t              steps_per_block = params.steps;
+
+    if (params.schedule == BLOCK_BASED) {
+        GGML_ASSERT(params.max_length % params.block_length == 0);
+        num_blocks = params.max_length / params.block_length;
+        GGML_ASSERT(params.steps % num_blocks == 0);
+        steps_per_block = params.steps / num_blocks;
+    }
+
+    int64_t total_sampling_time = 0;
+    int64_t total_time          = 0;
+    int64_t time_start          = ggml_time_us();
+
+    for (int block_num = 0; block_num < num_blocks; block_num++) {
+        int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
+        int32_t block_end   = (params.schedule == BLOCK_BASED) ?
+                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
+                                  params.max_length;
+
+        // Count masked tokens in current block for block-based processing
+        if (params.schedule == BLOCK_BASED) {
+            int32_t block_mask_count = 0;
+            for (int i = block_start; i < block_end; i++) {
+                if (output_tokens[i] == params.mask_token_id) {
+                    block_mask_count++;
+                }
+            }
+            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
+        }
+
+        for (int32_t step = 0; step < steps_per_block; step++) {
+            int32_t global_step = block_num * steps_per_block + step;
+
+            if (params.step_callback) {
+                if (!params.step_callback(
+                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
+                    break;
+                }
+            }
+
+            // Setup batch
+            for (int32_t i = 0; i < params.max_length; i++) {
+                batch.token[i]     = output_tokens[i];
+                batch.pos[i]       = i;
+                batch.n_seq_id[i]  = 1;
+                batch.seq_id[i][0] = 0;
+                batch.logits[i]    = 1;
+            }
+
+            float * logits = nullptr;
+
+            if (params.cfg_scale > 0.0f) {
+                int ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("Failed to generate conditional\n");
+                    break;
+                }
+                float * cond_logits_ptr = llama_get_logits(ctx);
+                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
+
+                // Unconditional generation (mask input)
+                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
+                for (int32_t i = 0; i < n_input; i++) {
+                    un_x_buffer[i] = params.mask_token_id;
+                }
+
+                for (int32_t i = 0; i < params.max_length; i++) {
+                    batch.token[i] = un_x_buffer[i];
+                }
+                ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("Failed to generate unconditional\n");
+                    break;
+                }
+                float * uncond_logits = llama_get_logits(ctx);
+
+                // Apply CFG
+                for (size_t i = 0; i < logits_size; i++) {
+                    cond_logits_buffer[i] =
+                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
+                }
+                logits = cond_logits_buffer.data();
+            } else {
+                int ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
+                    break;
+                }
+                logits = llama_get_logits(ctx);
+            }
+
+            if (!logits) {
+                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
+                break;
+            }
+
+            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
+                if (params.shift_logits) {
+                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
+                }
+                return logits + pos * n_vocab;
+            };
+
+            int64_t time_start_sampling = ggml_time_us();
+
+            mask_positions.clear();
+            for (int32_t i = 0; i < params.max_length; i++) {
+                if (output_tokens[i] == params.mask_token_id) {
+                    // For block-based, only consider current block
+                    if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
+                        mask_positions.push_back(i);
+                    }
+                }
+            }
+
+            if (mask_positions.empty()) {
+                break;
+            }
+            // NOTE(review): the call below perturbs only the first n_vocab entries of logits - confirm this is intended
+            if (params.add_gumbel_noise && params.temperature > 0.0f) {
+                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
+            }
+
+            if (params.algorithm == ORIGIN) {
+                int32_t transfer_count = calculate_transfer_count(
+                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
+                float p_transfer = (float) transfer_count / mask_positions.size();
+
+                for (int32_t pos : mask_positions) {
+                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
+                        const float * pos_logits = get_logits_for_pos(pos);
+                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                            candidates[token_id].id    = token_id;
+                            candidates[token_id].logit = pos_logits[token_id];
+                            candidates[token_id].p     = 0.0f;
+                        }
+
+                        llama_token_data_array cur_p = {
+                            candidates.data(),
+                            (size_t) n_vocab,
+                            -1,
+                            false,
+                        };
+
+                        llama_sampler_apply(sampler, &cur_p);
+                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
+                    }
+                }
+            } else {
+                std::vector<std::pair<float, int32_t>> confidences;
+                std::vector<llama_token>               sampled_tokens(mask_positions.size());
+
+                for (size_t i = 0; i < mask_positions.size(); i++) {
+                    int32_t       pos        = mask_positions[i];
+                    const float * pos_logits = get_logits_for_pos(pos);
+
+                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                        candidates[token_id].logit = pos_logits[token_id];
+                        candidates[token_id].p     = 0.0f;
+                        candidates[token_id].id    = token_id;
+                    }
+
+                    llama_token_data_array cur_p = {
+                        candidates.data(),
+                        candidates.size(),
+                        -1,
+                        false,
+                    };
+
+                    llama_sampler_apply(sampler, &cur_p);
+                    llama_token sampled_token = cur_p.data[cur_p.selected].id;
+
+                    float conf = calculate_confidence(cur_p, params.algorithm, rng);
+
+                    sampled_tokens[i] = sampled_token;
+                    confidences.emplace_back(conf, i);
+                }
+
+                int32_t transfer_count = calculate_transfer_count(
+                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
+
+                if (transfer_count > 0) {
+                    if (params.alg_temp == 0.0f) {
+                        std::partial_sort(confidences.begin(),
+                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
+                                          confidences.end(),
+                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
+                                              if (a.first != b.first) {
+                                                  return a.first > b.first;
+                                              }
+                                              return a.second < b.second;
+                                          });
+
+                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
+                            int32_t mask_idx   = confidences[i].second;
+                            int32_t pos        = mask_positions[mask_idx];
+                            output_tokens[pos] = sampled_tokens[mask_idx];
+                        }
+                    } else {
+                        conf_candidates.clear();
+                        for (size_t i = 0; i < confidences.size(); i++) {
+                            float conf_logit = confidences[i].first / params.alg_temp;
+                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
+                        }
+
+                        llama_token_data_array conf_array = {
+                            conf_candidates.data(),
+                            conf_candidates.size(),
+                            -1,
+                            false,
+                        };
+
+                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
+                            llama_sampler_apply(dist_sampler, &conf_array);
+                            int32_t selected_idx = conf_array.selected;
+                            int32_t mask_idx     = conf_array.data[selected_idx].id;  // sampler may reorder data
+                            int32_t pos          = mask_positions[mask_idx];
+                            output_tokens[pos]   = sampled_tokens[mask_idx];
+                            conf_array.data[selected_idx].logit = -INFINITY;  // exclude from later draws
+                            conf_array.sorted                   = false;
+                            conf_array.selected                 = -1;
+                        }
+                    }
+                }
+            }
+
+            int64_t time_end_sampling = ggml_time_us();
+            total_sampling_time += time_end_sampling - time_start_sampling;
+        }
+    }
+
+    int64_t time_end = ggml_time_us();
+    total_time += time_end - time_start;
+
+    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
+            total_time / 1000.0,
+            total_time / 1000.0 / params.steps,
+            total_sampling_time / 1000.0 / params.steps);
+
+    llama_batch_free(batch);
+    llama_sampler_free(sampler);
+    llama_sampler_free(dist_sampler);
+
+    n_generated = params.max_length;
+}
+
 static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
    if (!use_chat_template) {
        return prompt;
@@ -192,10 +631,10 @@ int main(int argc, char ** argv) {
    GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));

    if (params.diffusion.eps) {
-        diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
+        diff_params.schedule = TIMESTEP_BASED;
        diff_params.eps      = params.diffusion.eps;
    } else if (params.diffusion.block_length) {
-        diff_params.schedule     = DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED;
+        diff_params.schedule     = BLOCK_BASED;
        diff_params.block_length = params.diffusion.block_length;
    }

@@ -214,17 +653,8 @@ int main(int argc, char ** argv) {
    callback_data cb_data               = { &diff_params, vocab, n_input };
    diff_params.step_callback_user_data = &cb_data;

-    const char * alg_names[]   = {
-        "DIFFUSION_ALGORITHM_ORIGIN",
-        "DIFFUSION_ALGORITHM_ENTROPY_BASED",
-        "DIFFUSION_ALGORITHM_MARGIN_BASED",
-        "DIFFUSION_ALGORITHM_RANDOM",
-        "DIFFUSION_ALGORITHM_CONFIDENCE_BASED",
-    };
-    const char * sched_names[] = {
-        "DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED",
-        "DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED",
-    };
+    const char * alg_names[]   = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
+    const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
    const char * alg_name =
        (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
    const char * sched_name =
@@ -236,11 +666,11 @@ int main(int argc, char ** argv) {
    LOG_INF("diffusion_params: - %-25s enum             = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
    LOG_INF("diffusion_params: - %-25s enum             = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
    LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "temperature", diff_params.temperature);
-    if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED) {
+    if (diff_params.schedule == TIMESTEP_BASED) {
        LOG_INF("diffusion_params: - %-25s f32              = %.6f\n", "eps", diff_params.eps);
        LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "alg_temp", diff_params.alg_temp);
    }
-    if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
+    if (diff_params.schedule == BLOCK_BASED) {
        LOG_INF("diffusion_params: - %-25s u32              = %d\n", "block_length", diff_params.block_length);
        LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "cfg_scale", diff_params.cfg_scale);
    }
--- a/examples/diffusion/diffusion.cpp
+++ b/examples/diffusion/diffusion.cpp
@@ -1,408 +0,0 @@
-#include "diffusion.h"
-
-#include "log.h"
-
-#include <algorithm>
-#include <cstddef>
-#include <cmath>
-#include <cstring>
-#include <random>
-#include <utility>
-#include <vector>
-
-static float calculate_confidence(const llama_token_data_array & cur_p,
-                                  diffusion_algorithm            algorithm,
-                                  std::mt19937 &                 rng) {
-    switch (algorithm) {
-        case DIFFUSION_ALGORITHM_CONFIDENCE_BASED:
-            return cur_p.data[cur_p.selected].p;  // Selected token probability
-
-        case DIFFUSION_ALGORITHM_ENTROPY_BASED:
-            {
-                float       entropy = 0.0f;
-                const float epsilon = 1e-10f;
-                for (size_t i = 0; i < cur_p.size; i++) {
-                    float prob = cur_p.data[i].p;
-                    entropy += prob * logf(prob + epsilon);
-                }
-                return -entropy;  // Higher entropy = lower confidence
-            }
-
-        case DIFFUSION_ALGORITHM_MARGIN_BASED:
-            return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
-
-        case DIFFUSION_ALGORITHM_RANDOM:
-            {
-                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
-                return uniform(rng);  // Random confidence
-            }
-
-        case DIFFUSION_ALGORITHM_ORIGIN:
-            return cur_p.data[cur_p.selected].p;
-
-        default:
-            return 0.0f;
-    }
-}
-
-// Unified transfer count calculation function
-static int32_t calculate_transfer_count(int32_t                      step,
-                                        int32_t                      total_steps,
-                                        int32_t                      remaining_masked,
-                                        diffusion_transfer_schedule  schedule,
-                                        float                        eps,
-                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
-    switch (schedule) {
-        case DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED:
-            {
-                float t          = 1.0f - (float) step / total_steps * (1.0f - eps);
-                float s          = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
-                float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
-                return (int32_t) (remaining_masked * p_transfer);
-            }
-
-        case DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED:
-            if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
-                return num_transfer_tokens[step];
-            }
-            return remaining_masked / (total_steps - step);  // Fallback
-
-        default:
-            return remaining_masked / (total_steps - step);
-    }
-}
-
-static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
-    if (temperature == 0.0f) {
-        return;
-    }
-
-    std::uniform_real_distribution<double> uniform(0.0, 1.0);
-    for (int32_t i = 0; i < n_vocab; i++) {
-        double noise        = uniform(rng);
-        // Prevent log(0)
-        noise               = std::max(noise, 1e-20);
-        double gumbel_noise = std::pow(-std::log(noise), temperature);
-        logits[i]           = std::exp(logits[i]) / gumbel_noise;
-    }
-}
-
-static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
-    std::vector<int32_t> num_transfer_tokens(steps);
-
-    int32_t base      = mask_count / steps;
-    int32_t remainder = mask_count % steps;
-
-    for (int32_t i = 0; i < steps; i++) {
-        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
-    }
-
-    return num_transfer_tokens;
-}
-
-void diffusion_generate(llama_context *          ctx,
-                        const llama_token *      input_tokens,
-                        llama_token *            output_tokens,
-                        int32_t                  n_input,
-                        const diffusion_params & params,
-                        int32_t &                n_generated) {
-    n_generated = 0;
-    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
-        return;
-    }
-
-    const llama_model * model = llama_get_model(ctx);
-
-    // Initialize with input and pad with mask tokens
-    std::copy(input_tokens, input_tokens + n_input, output_tokens);
-    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
-
-    std::mt19937 rng(params.seed);
-
-    llama_set_causal_attn(ctx, false);
-
-    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
-
-    std::vector<llama_token_data> candidates(n_vocab);
-    std::vector<llama_token_data> conf_candidates;
-    conf_candidates.reserve(params.max_length);
-    std::vector<int32_t> mask_positions;
-    mask_positions.reserve(params.max_length);
-
-    // Setup sampler chain
-    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
-    if (params.top_k > 0) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
-    }
-    if (params.top_p < 1.0f) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
-    }
-    if (params.temperature > 0.0f) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
-    }
-    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
-
-    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
-
-    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
-    batch.n_tokens    = params.max_length;
-
-    // Pre-allocate buffers for CFG if needed
-    int32_t                  logits_size = n_vocab * params.max_length;
-    std::vector<float>       cond_logits_buffer;
-    std::vector<llama_token> un_x_buffer;
-    if (params.cfg_scale > 0.0f) {
-        cond_logits_buffer.resize(logits_size);
-        un_x_buffer.resize(params.max_length);
-    }
-
-    // For block-based processing
-    std::vector<int32_t> num_transfer_tokens;
-    int32_t              num_blocks      = 1;
-    int32_t              steps_per_block = params.steps;
-
-    if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
-        GGML_ASSERT(params.max_length % params.block_length == 0);
-        num_blocks = params.max_length / params.block_length;
-        GGML_ASSERT(params.steps % num_blocks == 0);
-        steps_per_block = params.steps / num_blocks;
-    }
-
-    std::vector<float> confidence(params.max_length);
-
-    int64_t total_sampling_time = 0;
-    int64_t total_time          = 0;
-    int64_t time_start          = ggml_time_us();
-
-    for (int block_num = 0; block_num < num_blocks; block_num++) {
-        int32_t block_start = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
-        int32_t block_end   = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ?
-                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
-                                  params.max_length;
-
-        // Count masked tokens in current block for block-based processing
-        if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
-            int32_t block_mask_count = 0;
-            for (int i = block_start; i < block_end; i++) {
-                if (output_tokens[i] == params.mask_token_id) {
-                    block_mask_count++;
-                }
-            }
-            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
-        }
-
-        for (int32_t step = 0; step < steps_per_block; step++) {
-            int32_t global_step = block_num * steps_per_block + step;
-
-            if (params.step_callback) {
-                if (!params.step_callback(
-                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
-                    break;
-                }
-            }
-
-            // Setup batch
-            for (int32_t i = 0; i < params.max_length; i++) {
-                batch.token[i]     = output_tokens[i];
-                batch.pos[i]       = i;
-                batch.n_seq_id[i]  = 1;
-                batch.seq_id[i][0] = 0;
-                batch.logits[i]    = 1;
-            }
-
-            float * logits = nullptr;
-
-            if (params.cfg_scale > 0.0f) {
-                int ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("Failed to generate conditional");
-                    break;
-                }
-                float * cond_logits_ptr = llama_get_logits(ctx);
-                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
-
-                // Unconditional generation (mask input)
-                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
-                for (int32_t i = 0; i < n_input; i++) {
-                    un_x_buffer[i] = params.mask_token_id;
-                }
-
-                for (int32_t i = 0; i < params.max_length; i++) {
-                    batch.token[i] = un_x_buffer[i];
-                }
-                ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("Failed to generate unconditional");
-                    break;
-                }
-                float * uncond_logits = llama_get_logits(ctx);
-
-                // Apply CFG
-                for (int32_t i = 0; i < logits_size; i++) {
-                    cond_logits_buffer[i] =
-                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
-                }
-                logits = cond_logits_buffer.data();
-            } else {
-                int ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
-                    break;
-                }
-                logits = llama_get_logits(ctx);
-            }
-
-            if (!logits) {
-                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
-                break;
-            }
-
-            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
-                if (params.shift_logits) {
-                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
-                }
-                return logits + pos * n_vocab;
-            };
-
-            int64_t time_start_sampling = ggml_time_us();
-
-            mask_positions.clear();
-            for (int32_t i = 0; i < params.max_length; i++) {
-                if (output_tokens[i] == params.mask_token_id) {
-                    // For block-based, only consider current block
-                    if (params.schedule != DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED || (i >= block_start && i < block_end)) {
-                        mask_positions.push_back(i);
-                    }
-                }
-            }
-
-            if (mask_positions.empty()) {
-                break;
-            }
-
-            if (params.add_gumbel_noise && params.temperature > 0.0f) {
-                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
-            }
-
-            if (params.algorithm == DIFFUSION_ALGORITHM_ORIGIN) {
-                int32_t transfer_count = calculate_transfer_count(
-                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
-                float p_transfer = (float) transfer_count / mask_positions.size();
-
-                for (int32_t pos : mask_positions) {
-                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
-                        const float * pos_logits = get_logits_for_pos(pos);
-                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
-                            candidates[token_id].id    = token_id;
-                            candidates[token_id].logit = pos_logits[token_id];
-                            candidates[token_id].p     = 0.0f;
-                        }
-
-                        llama_token_data_array cur_p = {
-                            candidates.data(),
-                            (size_t) n_vocab,
-                            -1,
-                            false,
-                        };
-
-                        llama_sampler_apply(sampler, &cur_p);
-                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
-                    }
-                }
-            } else {
-                std::vector<std::pair<float, int32_t>> confidences;
-                std::vector<llama_token>               sampled_tokens(mask_positions.size());
-
-                for (size_t i = 0; i < mask_positions.size(); i++) {
-                    int32_t       pos        = mask_positions[i];
-                    const float * pos_logits = get_logits_for_pos(pos);
-
-                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
-                        candidates[token_id].logit = pos_logits[token_id];
-                        candidates[token_id].p     = 0.0f;
-                        candidates[token_id].id    = token_id;
-                    }
-
-                    llama_token_data_array cur_p = {
-                        candidates.data(),
-                        candidates.size(),
-                        -1,
-                        false,
-                    };
-
-                    llama_sampler_apply(sampler, &cur_p);
-                    llama_token sampled_token = cur_p.data[cur_p.selected].id;
-
-                    float conf = calculate_confidence(cur_p, params.algorithm, rng);
-
-                    sampled_tokens[i] = sampled_token;
-                    confidences.emplace_back(conf, i);
-                }
-
-                int32_t transfer_count = calculate_transfer_count(
-                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
-
-                if (transfer_count > 0) {
-                    if (params.alg_temp == 0.0f) {
-                        std::partial_sort(confidences.begin(),
-                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
-                                          confidences.end(),
-                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
-                                              if (a.first != b.first) {
-                                                  return a.first > b.first;
-                                              }
-                                              return a.second < b.second;
-                                          });
-
-                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
-                            int32_t mask_idx   = confidences[i].second;
-                            int32_t pos        = mask_positions[mask_idx];
-                            output_tokens[pos] = sampled_tokens[mask_idx];
-                        }
-                    } else {
-                        conf_candidates.clear();
-                        for (size_t i = 0; i < confidences.size(); i++) {
-                            float conf_logit = confidences[i].first / params.alg_temp;
-                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
-                        }
-
-                        llama_token_data_array conf_array = {
-                            conf_candidates.data(),
-                            conf_candidates.size(),
-                            -1,
-                            false,
-                        };
-
-                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
-                            llama_sampler_apply(dist_sampler, &conf_array);
-                            int32_t selected_idx = conf_array.selected;
-                            int32_t mask_idx     = selected_idx;
-                            int32_t pos          = mask_positions[mask_idx];
-                            output_tokens[pos]   = sampled_tokens[mask_idx];
-
-                            conf_candidates[selected_idx].p = 0.0f;
-                            conf_array.selected             = -1;
-                        }
-                    }
-                }
-            }
-
-            int64_t time_end_sampling = ggml_time_us();
-            total_sampling_time += time_end_sampling - time_start_sampling;
-        }
-    }
-
-    int64_t time_end = ggml_time_us();
-    total_time += time_end - time_start;
-
-    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
-            total_time / 1000.0,
-            total_time / 1000.0 / params.steps,
-            total_sampling_time / 1000.0 / params.steps);
-
-    llama_batch_free(batch);
-    llama_sampler_free(sampler);
-    llama_sampler_free(dist_sampler);
-
-    n_generated = params.max_length;
-}
--- a/examples/diffusion/diffusion.h
+++ b/examples/diffusion/diffusion.h
@@ -1,57 +0,0 @@
-#pragma once
-
-#include "llama.h"
-
-#include <cstdint>
-
-enum diffusion_algorithm {
-    DIFFUSION_ALGORITHM_ORIGIN           = 0,
-    DIFFUSION_ALGORITHM_ENTROPY_BASED    = 1,
-    DIFFUSION_ALGORITHM_MARGIN_BASED     = 2,
-    DIFFUSION_ALGORITHM_RANDOM           = 3,
-    DIFFUSION_ALGORITHM_CONFIDENCE_BASED = 4,
-};
-
-// Unified transfer scheduling methods
-enum diffusion_transfer_schedule {
-    DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
-    DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
-};
-
-typedef bool (*diffusion_step_callback_t)(int32_t             step,
-                                          int32_t             total_steps,
-                                          const llama_token * tokens,
-                                          int32_t             n_tokens,
-                                          void *              user_data);
-
-struct diffusion_params {
-    int32_t                   steps                   = 0;
-    float                     temperature             = 0;
-    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;
-    diffusion_step_callback_t step_callback           = nullptr;
-    void *                    step_callback_user_data = nullptr;
-    int32_t                   seed                    = 0;
-    bool                      visual_mode             = false;
-    bool                      shift_logits            = false;  // Shift logits by -1 after decode
-
-    float   top_p = 0.;
-    int32_t top_k = 0.;
-
-    diffusion_algorithm         algorithm = DIFFUSION_ALGORITHM_CONFIDENCE_BASED;
-    diffusion_transfer_schedule schedule  = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
-
-    float   cfg_scale        = 0.;     // Config scale for classifier-free guidance
-    float   eps              = 0.;     // Timestep scheduling
-    int32_t block_length     = 0;      // Block size (for block scheduling)
-    float   alg_temp         = 0;      // algorithm temperature (0.0 = deterministic)
-    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0
-
-    int32_t max_length = 0;            // Maximum sequence length
-};
-
-void diffusion_generate(llama_context *          ctx,
-                        const llama_token *      input_tokens,
-                        llama_token *            output_tokens,
-                        int32_t                  n_input,
-                        const diffusion_params & params,
-                        int32_t &                n_generated);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -3,6 +3,7 @@
 #include "debug.h"
 #include "log.h"
 #include "llama.h"
+#include "llama-cpp.h"

 #include <clocale>
 #include <string>
@@ -37,7 +38,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");

-    common_debug_cb_user_data cb_data;
+    base_callback_data cb_data;

    common_params params;

@@ -52,7 +53,7 @@ int main(int argc, char ** argv) {

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
-    params.cb_eval = common_debug_cb_eval;
+    params.cb_eval = common_debug_cb_eval<false>;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -73,12 +73,12 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
    auto ctx_arg = common_params_parser_init(params, md.ex);

    std::vector<common_arg *> common_options;
-    std::vector<common_arg *> sampling_options;
+    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;
    for (auto & opt : ctx_arg.options) {
        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
-        if (opt.is_sampling) {
-            sampling_options.push_back(&opt);
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
@@ -93,7 +93,7 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
    ss << "### Common params\n\n";
    write_table(ss, common_options);
    ss << "\n\n### Sampling params\n\n";
-    write_table(ss, sampling_options);
+    write_table(ss, sparam_options);
    ss << "\n\n### " << md.specific_section_header << "\n\n";
    write_table(ss, specific_options);

--- a/examples/llama-eval/README.md
+++ b/examples/llama-eval/README.md
@@ -1,26 +0,0 @@
-# llama-eval
-
-Simple evaluation tool for llama.cpp with support for multiple datasets.
-
-For a full description, usage examples, and sample results, see:
-
- [PR 21152](https://github.com/ggml-org/llama.cpp/pull/21152)
-
-## Quick start
-
-```bash
-# Single server
-python3 llama-eval.py \
-  --server http://localhost:8033 \
-  --model my-model \
-  --dataset gsm8k --n_cases 100 \
-  --grader-type regex --threads 32
-
-# Multiple servers (comma-separated URLs and thread counts)
-python3 llama-eval.py \
-  --server http://server1:8033,http://server2:8033 \
-  --server-name server1,server2 \
-  --threads 16,16 \
-  --dataset aime2025 --n_cases 240 \
-  --grader-type regex
-```
--- a/examples/llama-eval/llama-eval.py
+++ b/examples/llama-eval/llama-eval.py
--- a/examples/llama-eval/llama-server-simulator.py
+++ b/examples/llama-eval/llama-server-simulator.py
@@ -1,317 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import json
-import random
-import re
-import time
-import sys
-import os
-import threading
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from typing import Dict, List, Optional
-from dataclasses import dataclass
-from pathlib import Path
-
-import datasets
-
-# Set cache directory for HuggingFace datasets
-cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
-cache_dir.mkdir(parents=True, exist_ok=True)
-os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
-
-def dice(s1: str, s2: str) -> float:
-    """Calculate Dice coefficient between two strings based on bigram overlap."""
-    if not s1 and not s2:
-        return 1.0
-
-    def _bigrams(s: str):
-        return [s[i : i + 2] for i in range(len(s) - 1)]
-
-    bigrams1 = _bigrams(s1)
-    bigrams2 = _bigrams(s2)
-
-    if not bigrams1 and not bigrams2:
-        return 1.0
-
-    from collections import Counter
-
-    freq1 = Counter(bigrams1)
-    freq2 = Counter(bigrams2)
-
-    intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
-    dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
-    return dice_coeff
-
-def debug_log(message: str):
-    """Log debug messages to both stdout and a file"""
-    print(message, file=sys.stderr)
-    with open("/tmp/simulator-debug.log", "a") as f:
-        f.write(message + "\n")
-
-simulator: Optional["Simulator"] = None
-
-@dataclass
-class EvalState:
-    id: str
-    tasks: List[str]
-    task_states: Dict[str, Dict]
-    sampling_config: Dict
-
-def normalize_number(s: str) -> Optional[int]:
-    match = re.match(r"\d+", s)  # match digits from the start
-    if not match:
-        return None
-    return int(match.group(0))
-
-class AimeDataset:
-    def __init__(self, split: str = "train"):
-        self.split = split
-        self.questions: List[Dict] = []
-        self._load_dataset()
-
-    def _load_dataset(self):
-        print(f"Loading AIME dataset (split: {self.split})...")
-
-        cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
-        if cache_path.exists():
-            print(f"Using cached dataset from {cache_path}")
-            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
-        else:
-            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
-
-        self.questions = list(ds)
-        print(f"AIME dataset loaded: {len(self.questions)} questions")
-
-    def find_question(self, request_text: str) -> Optional[Dict]:
-        best_match = None
-        best_distance = -1
-        best_index = -1
-
-        for i, question in enumerate(self.questions):
-            question_text = question["problem"]
-            request_lower = request_text.lower()
-            question_lower = question_text.lower()
-
-            # Exact match
-            if question_lower == request_lower:
-                debug_log(f"DEBUG: Found exact match at index {i}")
-                return question
-
-            # Remove LaTeX formatting for more flexible matching
-            question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
-            if question_no_latex.lower() == request_lower:
-                debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
-                return question
-
-            # Calculate Dice coefficient for partial matches
-            # Only consider if request is at least 50% of question length
-            if len(request_lower) >= len(question_lower) * 0.5:
-                distance = dice(question_lower, request_lower)
-
-                if distance > best_distance:
-                    best_distance = distance
-                    best_match = question
-                    best_index = i
-
-        if best_match and best_distance > 0.3:  # Threshold for partial match
-            debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
-            return best_match
-
-        debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
-        return None
-
-    def get_answer(self, question: Dict) -> str:
-        answer = question["answer"]
-        if isinstance(answer, str):
-            normalized = normalize_number(answer)
-            return str(normalized) if normalized is not None else answer
-        return str(answer)
-
-class Simulator:
-    def __init__(
-        self,
-        port: int = 8033,
-        host: str = "localhost",
-        success_rate: float = 0.8,
-        dataset_split: str = "train"
-    ):
-        self.port = port
-        self.host = host
-        self.success_rate = success_rate
-        self.dataset = AimeDataset(dataset_split)
-        self.eval_state = EvalState(
-            id="aime-2025",
-            tasks=["aime"],
-            task_states={},
-            sampling_config={"temperature": 0, "max_tokens": 2048}
-        )
-
-    def _generate_response(
-        self,
-        question: Dict,
-        should_be_correct: bool
-    ) -> Dict:
-        expected_answer = self.dataset.get_answer(question)
-
-        if should_be_correct:
-            response_text = expected_answer
-        else:
-            response_text = self._generate_wrong_answer(question)
-
-        return {
-            "id": f"chatcmpl-{int(time.time())}",
-            "object": "chat.completion",
-            "created": int(time.time()),
-            "model": "llama",
-            "choices": [
-                {
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": response_text
-                    },
-                    "finish_reason": "stop"
-                }
-            ],
-            "usage": {
-                "prompt_tokens": 100,
-                "completion_tokens": 50,
-                "total_tokens": 150
-            }
-        }
-
-    def _generate_wrong_answer(self, question: Dict) -> str:
-        expected_answer = self.dataset.get_answer(question)
-
-        if expected_answer.isdigit():
-            wrong_answer = str(int(expected_answer) + 1)
-        else:
-            wrong_answer = expected_answer + " (wrong)"
-
-        return wrong_answer
-
-    def _process_request(self, request_data: Dict) -> Dict:
-        messages = request_data.get("messages", [])
-        if not messages:
-            return {"error": "No messages in request"}
-
-        request_text = messages[0].get("content", "")
-        debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
-
-        question = self.dataset.find_question(request_text)
-        if not question:
-            debug_log(f"DEBUG: find_question returned None")
-            return {"error": "No matching question found"}
-
-        should_be_correct = random.random() < self.success_rate
-
-        response = self._generate_response(question, should_be_correct)
-
-        task_id = "aime"
-        self.eval_state.task_states[task_id] = {
-            "correct": should_be_correct,
-            "expected": self.dataset.get_answer(question),
-            "predicted": response["choices"][0]["message"]["content"]
-        }
-
-        return response
-
-class RequestHandler(BaseHTTPRequestHandler):
-    def do_POST(self):
-        if self.path != "/v1/chat/completions":
-            self._send_json({"error": "Not found"}, 404)
-            return
-
-        try:
-            content_length = int(self.headers.get("Content-Length", 0))
-            body = self.rfile.read(content_length)
-            request_data = json.loads(body) if body else None
-
-            if not request_data:
-                self._send_json({"error": "Invalid JSON"}, 400)
-                return
-
-            if simulator is None:
-                self._send_json({"error": "Simulator not initialized"}, 500)
-                return
-
-            response = simulator._process_request(request_data)
-            self._send_json(response, 200)
-
-        except json.JSONDecodeError:
-            self._send_json({"error": "Invalid JSON"}, 400)
-        except Exception as e:
-            print(f"Error processing request: {e}")
-            self._send_json({"error": str(e)}, 500)
-
-    def _send_json(self, data: dict, status: int = 200):
-        body = json.dumps(data).encode("utf-8")
-        self.send_response(status)
-        self.send_header("Content-Type", "application/json")
-        self.send_header("Content-Length", str(len(body)))
-        self.end_headers()
-        self.wfile.write(body)
-
-    def log_message(self, format, *args):
-        # Suppress default request logging
-        pass
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="llama-server simulator for testing eval scripts"
-    )
-    parser.add_argument(
-        "--port",
-        type=int,
-        default=8033,
-        help="Server port (default: 8033)"
-    )
-    parser.add_argument(
-        "--host",
-        type=str,
-        default="localhost",
-        help="Server host (default: localhost)"
-    )
-    parser.add_argument(
-        "--success-rate",
-        type=float,
-        default=0.8,
-        help="Success rate 0-1 (default: 0.8)"
-    )
-    parser.add_argument(
-        "--dataset-split",
-        type=str,
-        default="train",
-        help="AIME dataset split to use (default: train)"
-    )
-
-    args = parser.parse_args()
-
-    global simulator
-    simulator = Simulator(
-        port=args.port,
-        host=args.host,
-        success_rate=args.success_rate,
-        dataset_split=args.dataset_split
-    )
-
-    server = HTTPServer((args.host, args.port), RequestHandler)
-    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
-    server_thread.start()
-
-    print("\n=== llama-server-simulator ===")
-    print(f"Server running on http://{args.host}:{args.port}")
-    print(f"Success rate: {args.success_rate}")
-    print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
-    print("\nPress Ctrl+C to stop\n")
-
-    try:
-        server_thread.join()
-    except KeyboardInterrupt:
-        print("\nShutting down...")
-        server.shutdown()
-
-if __name__ == "__main__":
-    main()
--- a/examples/llama-eval/test-simulator.sh
+++ b/examples/llama-eval/test-simulator.sh
@@ -1,86 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Get the directory where this script is located
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-echo "=== llama-server-simulator Test Script ==="
-echo ""
-
-PORT=8033
-SUCCESS_RATE=0.8
-TEST_PORT=8034
-
-echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
-source "$SCRIPT_DIR/venv/bin/activate"
-python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
-SIMULATOR_PID=$!
-
-echo "Waiting for simulator to start..."
-sleep 5
-
-# Helper function to make a request and extract the answer
-make_request() {
-  local question="$1"
-  curl -s -X POST http://localhost:$PORT/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d "{
-      \"model\": \"llama\",
-      \"messages\": [
-        {\"role\": \"user\", \"content\": \"$question\"}
-      ],
-      \"temperature\": 0,
-      \"max_tokens\": 2048
-    }" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
-}
-
-# Test question (repeated in multiple tests)
-TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
-
-echo ""
-echo "=== Test 1: Correct Answer ==="
-echo "Sending request with known question..."
-answer=$(make_request "$TEST_QUESTION")
-echo "Answer: $answer"
-echo "Expected: 116"
-echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
-
-echo ""
-echo "=== Test 2: Wrong Answer ==="
-echo "Sending request with known question (success rate 0.0)..."
-answer=$(make_request "$TEST_QUESTION")
-echo "Answer: $answer"
-echo "Expected: 116"
-echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
-
-echo ""
-echo "=== Test 3: No Matching Question ==="
-echo "Sending request with non-matching text..."
-response=$(make_request "What is the capital of France?")
-echo "Response: $response"
-echo "Expected: No matching question found"
-echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
-
-echo ""
-echo "=== Test 4: Success Rate Verification ==="
-echo "Sending 10 requests to test success rate..."
-correct_count=0
-for i in {1..10}; do
-  answer=$(make_request "$TEST_QUESTION")
-  if [ "$answer" == "116" ]; then
-    correct_count=$((correct_count + 1))
-  fi
-  echo "  Request $i: Answer = $answer"
-done
-echo "Correct answers: $correct_count/10"
-echo "Expected: ~8/10 (80% success rate)"
-echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"
-
-echo ""
-echo "=== Test Complete ==="
-echo "Stopping simulator..."
-kill $SIMULATOR_PID 2>/dev/null
-wait $SIMULATOR_PID 2>/dev/null || true
-
-echo "Simulator stopped."
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -37,9 +37,9 @@ int main(int argc, char ** argv){

    common_ngram_cache ngram_cache;
    common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
-    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.ngram_cache.lookup_cache_static.c_str());
+    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.lookup_cache_static.c_str());

-    common_ngram_cache_save(ngram_cache, params.speculative.ngram_cache.lookup_cache_static);
+    common_ngram_cache_save(ngram_cache, params.speculative.lookup_cache_static);

    return 0;
 }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -24,7 +24,7 @@ int main(int argc, char ** argv){
        return 1;
    }

-    const int n_draft = params.speculative.draft.n_max;
+    const int n_draft = params.speculative.n_max;

    // init llama.cpp
    llama_backend_init();
@@ -49,18 +49,18 @@ int main(int argc, char ** argv){
    {
        const int64_t t_start_draft_us = ggml_time_us();

-        if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
+        if (!params.speculative.lookup_cache_static.empty()) {
            try {
-                ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
                exit(1);
            }
        }

-        if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
+        if (!params.speculative.lookup_cache_dynamic.empty()) {
            try {
-                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
        }

--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -25,7 +25,7 @@ int main(int argc, char ** argv){
    }

    // max. number of additional tokens to draft if match is found
-    const int n_draft = params.speculative.draft.n_max;
+    const int n_draft = params.speculative.n_max;

    // init llama.cpp
    llama_backend_init();
@@ -54,18 +54,18 @@ int main(int argc, char ** argv){
        const int64_t t_start_draft_us = ggml_time_us();
        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

-        if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
+        if (!params.speculative.lookup_cache_static.empty()) {
            try {
-                ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
                exit(1);
            }
        }

-        if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
+        if (!params.speculative.lookup_cache_dynamic.empty()) {
            try {
-                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
        }

@@ -213,7 +213,7 @@ int main(int argc, char ** argv){

    // Update dynamic ngram cache with context ngram cache and save it to disk:
    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-    common_ngram_cache_save(ngram_cache_dynamic, params.speculative.ngram_cache.lookup_cache_dynamic);
+    common_ngram_cache_save(ngram_cache_dynamic, params.speculative.lookup_cache_dynamic);

    LOG("\n\n");

--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@@ -52,10 +52,6 @@ causal-convert-mm-model:
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/causal/convert-model.sh

-	$(MAKE) causal-convert-mmproj MM_OUTTYPE="$(MM_OUTTYPE)"
-
-causal-convert-mmproj:
-	$(call validate_model_path,causal-convert-mmproj)
 	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(MM_OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/causal/convert-model.sh --mmproj
--- a/examples/model-conversion/scripts/causal/convert-model.sh
+++ b/examples/model-conversion/scripts/causal/convert-model.sh
@@ -25,11 +25,7 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
 OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
 TYPE="${OUTTYPE:-f16}"
 METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
-if [[ -n "$MMPROJ" ]]; then
-    CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
-else
-    CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
-fi
+CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"

 echo "Model path: ${MODEL_PATH}"
 echo "Model name: ${MODEL_NAME}"
@@ -42,7 +38,6 @@ if [[ -n "$DEBUG" ]]; then
 else
    CMD_ARGS=("python")
 fi
-
 CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
 CMD_ARGS+=("${MODEL_PATH}")
 CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
@@ -55,3 +50,7 @@ CMD_ARGS+=("--outtype" "${TYPE}")
 echo ""
 echo "The environment variable CONVERTED_MODEL can be set to this path using:"
 echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
+if [[ -n "$MMPROJ" ]]; then
+    mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
+    echo "The mmproj model was created in $(realpath "$mmproj_file")"
+fi
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -38,12 +38,8 @@ int main(int argc, char ** argv) {
    std::string result0;
    std::string result1;
    std::string result2;
-    std::string result3;

    // init
-
-    ggml_backend_load_all();
-
    auto llama_init = common_init_from_params(params);

    auto * model = llama_init->model();
@@ -217,83 +213,11 @@ int main(int argc, char ** argv) {
        n_past += 1;
    }

-    // test on-device state save/load
-    auto params_ctx4 = common_context_params_to_llama(params);
-    params_ctx4.n_seq_max = 2;
-    llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
-
-    llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
-
-    printf("\nsingle seq run: %s", params.prompt.c_str());
-
-    // load state (rng, logits, embedding and kv_cache) from file
-    n_token_count_out = 0;
-
-    if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
-        fprintf(stderr, "\n%s : failed to load state\n", __func__);
-        return 1;
-    }
-
-    fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
-
-    // restore state (last tokens)
-    n_past = n_token_count_out;
-    if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
-        return 1;
-    }
-    ++n_past;
-
-    // save seq 0 and load into seq 1
-    {
-        // save kv of seq 0
-        std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
-        const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-        if (ncopy != seq_store.size()) {
-            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
-            return 1;
-        }
-        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
-
-        // erase whole kv
-        llama_memory_clear(llama_get_memory(ctx4), true);
-        fprintf(stderr, "%s : kv cache cleared\n", __func__);
-
-        // restore kv into seq 0
-        const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-        if (nset != seq_store.size()) {
-            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
-            return 1;
-        }
-        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
-    }
-
-    // forth run
-    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl4, ctx4, -1);
-        auto next_token_str = common_token_to_piece(ctx4, next_token);
-
-        printf("%s", next_token_str.c_str());
-        result3 += next_token_str;
-
-        common_batch_clear(batch);
-        common_batch_add(batch, next_token, n_past, {1}, true);
-
-        if (llama_decode(ctx4, batch)) {
-            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_free(batch);
-            return 1;
-        }
-        n_past += 1;
-    }
-
    printf("\n");

    llama_sampler_free(smpl);
    llama_sampler_free(smpl2);
    llama_sampler_free(smpl3);
-    llama_sampler_free(smpl4);

    llama_batch_free(batch);

@@ -302,18 +226,12 @@ int main(int argc, char ** argv) {

    llama_free(ctx2);
    llama_free(ctx3);
-    llama_free(ctx4);

    if (result0 != result2) {
        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
        return 1;
    }

-    if (result0 != result3) {
-        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
-        return 1;
-    }
-
    fprintf(stderr, "\n%s : success\n", __func__);

    return 0;
--- a/examples/speculative-simple/README.md
+++ b/examples/speculative-simple/README.md
@@ -6,7 +6,7 @@ Demonstration of basic greedy speculative decoding
 ./bin/llama-speculative-simple \
    -m  ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
    -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
-    -f test.txt -c 0 -ngl 99 --color on \
-    --sampling-seq k --top-k 1 -fa on --temp 0.0 \
-    -ngld 99 --spec-draft-n-max 16 --spec-draft-n-draft-min 5 --draft-p-min 0.9
+    -f test.txt -c 0 -ngl 99 --color \
+    --sampling-seq k --top-k 1 -fa --temp 0.0 \
+    -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
 ```
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -8,10 +8,8 @@
 #include <clocale>
 #include <cstdio>
 #include <cstring>
-#include <cinttypes>
 #include <string>
 #include <vector>
-#include <utility>

 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
@@ -29,6 +27,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    if (params.speculative.mparams_dft.path.empty()) {
+        LOG_ERR("%s: --model-draft is required\n", __func__);
+        return 1;
+    }
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -47,24 +50,26 @@ int main(int argc, char ** argv) {

    // load the draft model
    llama_model_ptr model_dft;
-    llama_context_ptr ctx_dft;

    // TODO: simplify this logic
    {
-        const auto & params_spec = params.speculative.draft;
+        const auto & params_spec = params.speculative;

        auto params_dft = params;

+        params_dft.n_parallel   = 1;
+        params_dft.n_ctx        = params_spec.n_ctx;
+        params_dft.n_batch      = llama_n_ctx_seq(ctx_tgt);
        params_dft.devices      = params_spec.devices;
-        params_dft.model        = params_spec.mparams;
+        params_dft.model        = params_spec.mparams_dft;
        params_dft.n_gpu_layers = params_spec.n_gpu_layers;

        if (params_spec.cpuparams.n_threads > 0) {
-            params_dft.cpuparams.n_threads       = params.speculative.draft.cpuparams.n_threads;
-            params_dft.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
+            params_dft.cpuparams.n_threads       = params.speculative.cpuparams.n_threads;
+            params_dft.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
        }

-        params_dft.tensor_buft_overrides = params.speculative.draft.tensor_buft_overrides;
+        params_dft.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

        auto mparams_dft = common_model_params_to_llama(params_dft);

@@ -74,19 +79,8 @@ int main(int argc, char ** argv) {
            return 1;
        }

-        auto cparams = common_context_params_to_llama(params_dft);
-        ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams));
-
-        params.speculative.draft.ctx_tgt = ctx_tgt;
-        params.speculative.draft.ctx_dft = ctx_dft.get();
-    }
-
-    // check if the context supports partial sequence removal
-    const bool use_ckpt_tgt = (common_context_can_seq_rm(ctx_tgt)       == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
-    const bool use_ckpt_dft = (common_context_can_seq_rm(ctx_dft.get()) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
-
-    if (use_ckpt_tgt) {
-        LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
+        params.speculative.model_dft = model_dft.get();
+        params.speculative.cparams_dft = common_context_params_to_llama(params_dft);
    }

    // Tokenize the prompt
@@ -118,8 +112,6 @@ int main(int argc, char ** argv) {
    // used to determine end of generation
    bool has_eos = false;

-    llama_seq_id seq_id = 0;
-
    // ================================================
    // everything until here is standard initialization
    // the relevant stuff for speculative decoding starts here
@@ -127,11 +119,10 @@ int main(int argc, char ** argv) {
    const auto t_enc_start = ggml_time_us();

    // target model sampling context
-    common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);

    // eval the prompt
-    llama_decode(ctx_tgt,       llama_batch_get_one(inp.data(), inp.size() - 1));
-    llama_decode(ctx_dft.get(), llama_batch_get_one(inp.data(), inp.size() - 1));
+    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));

    // note: keep the last token separate!
    llama_token id_last = inp.back();
@@ -145,81 +136,41 @@ int main(int argc, char ** argv) {
    // init the speculator
    const auto & params_spec = params.speculative;

-    struct common_speculative * spec = common_speculative_init(params.speculative, 1);
+    struct common_speculative * spec = common_speculative_init(params.speculative, ctx_tgt);

-    common_speculative_begin(spec, seq_id, prompt_tgt);
+    common_speculative_begin(spec, prompt_tgt);

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);

-    size_t n_draft = 0;
-
-    llama_tokens draft;
-    common_prompt_checkpoint ckpt;
-
    const auto t_enc_end = ggml_time_us();

    const auto t_dec_start = ggml_time_us();

    while (true) {
-        // generate or reuse draft tokens
+        // optionally, generate draft tokens that can be appended to the target batch
        //
        // this is the most important part of the speculation. the more probable tokens that are provided here
        // the better the performance will be. in theory, this computation can be performed asynchronously and even
        // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
        // from a cache or lookup tables.
        //
-        if (draft.empty()) {
-            ckpt.update_pos(
-                    prompt_tgt.size(),
-                    llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), seq_id),
-                    llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), seq_id));
+        llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);

-            if (use_ckpt_dft) {
-                ckpt.update_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-            }
-
-            // generate a new draft
-            common_speculative_get_draft_params(spec, seq_id) = {
-                /* .drafting   = */ true,
-                /* .n_max      = */ -1,
-                /* .n_past     = */ n_past,
-                /* .id_last    = */ id_last,
-                /* .prompt     = */ &prompt_tgt,
-                /* .result     = */ &draft, // output
-            };
-            common_speculative_draft(spec);
-
-            // save the original draft size
-            n_draft = draft.size();
-
-            // save a checkpoint of the target context before evaluating the draft
-            // this allows us to restore the state if partial draft acceptance occurs
-            if (!draft.empty()) {
-                if (use_ckpt_tgt) {
-                    ckpt.update_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-                }
-            }
-
-            {
-                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-
-                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
-            }
-        } else {
-            // we have a previous (partial) draft to reuse from checkpoint restoration
-            if (use_ckpt_tgt) {
-                GGML_ASSERT(!ckpt.empty());
-            }
-        }
+        //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());

        // always have a token to evaluate from before - id_last
        common_batch_clear(batch_tgt);
-        common_batch_add  (batch_tgt, id_last, n_past++, { seq_id }, true);
+        common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);

        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
        {
+            // do not waste time on small drafts
+            if (draft.size() < (size_t) params_spec.n_min) {
+                draft.clear();
+            }
+
            for (size_t i = 0; i < draft.size(); ++i) {
-                common_batch_add(batch_tgt, draft[i], n_past + i, { seq_id }, true);
+                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
            }

            //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
@@ -227,18 +178,6 @@ int main(int argc, char ** argv) {
            llama_decode(ctx_tgt, batch_tgt);
        }

-        // evaluate the same batch with the draft model
-        {
-            // TODO: extend to support MTP, Eagle, etc. See server code for reference
-            llama_decode(ctx_dft.get(), batch_tgt);
-        }
-
-        // only save the sampler sampler state if we use checkpoints
-        common_sampler_ptr smpl_save;
-        if (use_ckpt_tgt) {
-            smpl_save.reset(common_sampler_clone(smpl.get()));
-        }
-
        // sample from the full target batch and return the accepted tokens based on the target sampler
        //
        // for each token to be accepted, the sampler would have to sample that same token
@@ -246,45 +185,14 @@ int main(int argc, char ** argv) {
        // available logits from the batch and sample the next token until we run out of logits or the sampler
        // disagrees with the draft
        //
-        auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);
+        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);

        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());

        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token

-        // check for partial draft acceptance:
-        // if the context doesn't support partial sequence removal, restore the checkpoint
-        // and make the accepted tokens the new partial draft for the next iteration
-        if (use_ckpt_tgt && ids.size() - 1 < draft.size()) {
-            LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
-
-            draft = std::move(ids);
-
-            {
-                ckpt.load_tgt(ctx_tgt, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-
-                llama_memory_seq_rm(llama_get_memory(ctx_tgt), seq_id, ckpt.pos_max + 1, -1);
-            }
-
-            {
-                ckpt.load_dft(ctx_dft.get(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-
-                llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, ckpt.pos_max + 1, -1);
-            }
-
-            prompt_tgt.resize(ckpt.n_tokens);
-            smpl = std::move(smpl_save);
-
-            n_past = (int) prompt_tgt.size();
-
-            continue;
-        }
-
-        common_speculative_accept(spec, seq_id, ids.size() - 1);
-
-        // full acceptance: consume the draft and commit accepted tokens
        n_past    += ids.size() - 1;
-        n_drafted += n_draft; // note: we ignore the discarded small drafts
+        n_drafted += draft.size(); // note: we ignore the discarded small drafts
        n_accept  += ids.size() - 1;
        n_predict += ids.size();

@@ -314,14 +222,10 @@ int main(int argc, char ** argv) {

        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);

-        // clear the draft since it has been consumed
-        draft.clear();
-
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

-            llama_memory_seq_rm(llama_get_memory(ctx_tgt),       seq_id, n_past, -1);
-            llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), seq_id, n_past, -1);
+            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
        }

        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
@@ -339,7 +243,7 @@ int main(int argc, char ** argv) {
    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

    LOG_INF("\n");
-    LOG_INF("n_draft   = %d\n", params_spec.draft.n_max);
+    LOG_INF("n_draft   = %d\n", params_spec.n_max);
    LOG_INF("n_predict = %d\n", n_predict);
    LOG_INF("n_drafted = %d\n", n_drafted);
    LOG_INF("n_accept  = %d\n", n_accept);
@@ -350,10 +254,11 @@ int main(int argc, char ** argv) {

    LOG_INF("\n");
    LOG_INF("target:\n\n");
-    common_perf_print(ctx_tgt, smpl.get());
+    common_perf_print(ctx_tgt, smpl);

    llama_batch_free(batch_tgt);

+    common_sampler_free(smpl);
    common_speculative_free(spec);

    llama_backend_free();
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -49,7 +49,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.speculative.draft.mparams.path.empty()) {
+    if (params.speculative.mparams_dft.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
    const int n_seq_dft = params.n_parallel;

    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
-    const float p_draft_split = params.speculative.draft.p_split;
+    const float p_draft_split = params.speculative.p_split;

    std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
    std::uniform_real_distribution<> u_dist;
@@ -80,15 +80,15 @@ int main(int argc, char ** argv) {
    ctx_tgt   = llama_init_tgt->context();

    // load the draft model
-    params.devices = params.speculative.draft.devices;
-    params.model = params.speculative.draft.mparams;
-    params.n_gpu_layers = params.speculative.draft.n_gpu_layers;
-    if (params.speculative.draft.cpuparams.n_threads > 0) {
-        params.cpuparams.n_threads = params.speculative.draft.cpuparams.n_threads;
+    params.devices = params.speculative.devices;
+    params.model = params.speculative.mparams_dft;
+    params.n_gpu_layers = params.speculative.n_gpu_layers;
+    if (params.speculative.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
    }

-    params.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
-    params.tensor_buft_overrides     = params.speculative.draft.tensor_buft_overrides;
+    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
+    params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;

    auto llama_init_dft = common_init_from_params(params);

@@ -110,21 +110,13 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
-        (llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
-        LOG_ERR("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
-                __func__,
-                llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
-                llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
-        return 1;
-    }
-
-    if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
-        (llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
-        LOG_ERR("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
-                __func__,
-                llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
-                llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
+    if (
+        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
+        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
+    ) {
+        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
        return 1;
    }

@@ -145,12 +137,11 @@ int main(int argc, char ** argv) {
        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
            const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
-
            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
                LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
                LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
-                        common_token_to_piece(vocab_tgt, i).c_str(),
-                        common_token_to_piece(vocab_dft, i).c_str());
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
                return 1;
            }
        }
@@ -192,7 +183,7 @@ int main(int argc, char ** argv) {
    //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));

    // how many tokens to draft each time
-    int n_draft = params.speculative.draft.n_max;
+    int n_draft = params.speculative.n_max;

    int n_predict = 0;
    int n_drafted = 0;
--- a/examples/sycl/start-svr.sh
+++ b/examples/sycl/start-svr.sh
@@ -1,124 +0,0 @@
-#!/bin/bash
-
-#  MIT license
-#  Copyright (C) 2024 Intel Corporation
-#  SPDX-License-Identifier: MIT
-
-Help() {
-  cat << EOF
-Usage: $(basename "$0") [OPTIONS]
-
-This script processes files with specified options.
-
-Options:
-  -h, --help    Display this help message and exit.
-  -c, --context <value>    Set context length. Bigger need more memory.
-  -p, --promote <value>    Prompt to start generation with.
-  -m, --model   <value>    Full model file path.
-  -mg,--main-gpu <value>   Set main GPU ID (0 - n) for single GPU mode.
-  -sm,--split-mode <value> How to split the model across multiple GPUs, one of:
-                            - none: use one GPU only
-                            - layer (default): split layers and KV across GPUs
-                            - row: split rows across GPUs
-  -ngl,--n-gpu-layers <value>  Max. number of layers to store in VRAM (default: -1)
-  -lv,--log-verbosity <value>  Set the verbosity threshold. Messages with a higher verbosity will be
-                               ignored. Values:
-                                - 0: generic output
-                                - 1: error
-                                - 2: warning
-                                - 3: info
-                                - 4: debug
-
-
-EOF
-}
-
-BIN_FILE=./build/bin/llama-server
-SEED=0
-GPUS_SETTING=""
-
-MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
-NGL=99
-CONTEXT=4096
-GGML_SYCL_DEVICE=-1
-SPLIT_MODE=layer
-LOG_VERBOSE=3
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        -c|--context)
-            CONTEXT=$2
-            # Shift twice to consume both the option flag and its value
-            shift
-            shift
-            ;;
-        -m|--model)
-            MODEL_FILE="$2"
-            # Shift twice to consume both the option flag and its value
-            shift
-            shift
-            ;;
-        -mg|--main-gpu)
-            GGML_SYCL_DEVICE=$2
-            SPLIT_MODE=none
-            # Shift twice to consume both the option flag and its value
-            shift
-            shift
-            ;;
-        -sm|--split-mode)
-            SPLIT_MODE=$2
-            # Shift twice to consume both the option flag and its value
-            shift
-            shift
-            ;;
-        -ngl|--n-gpu-layers)
-            NGL=$2
-            # Shift twice to consume both the option flag and its value
-            shift
-            shift
-            ;;
-        -lv|--log-verbosity)
-            LOG_VERBOSE=$2
-            # Shift twice to consume both the option flag and its value
-            shift
-            shift
-            ;;
-        -h|--help)
-            Help
-            exit 0
-            ;;
-        *)
-            # Handle unknown options or stop processing options
-            echo "Invalid option: $1"
-            # Optional: exit script or shift to treat remaining as positional args
-            exit 1
-            ;;
-    esac
-done
-
-
-
-source /opt/intel/oneapi/setvars.sh
-
-#export GGML_SYCL_DEBUG=1
-
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-
-#support malloc device memory more than 4GB.
-export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
-
-if [ $GGML_SYCL_DEVICE -ne -1 ]; then
-    echo "Use $GGML_SYCL_DEVICE as main GPU"
-    #use signle GPU only
-    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
-    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
-else
-    echo "Use all Intel GPUs, including iGPU & dGPU"
-    GPUS_SETTING="-sm ${SPLIT_MODE}"
- fi
-
-echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap --host 0.0.0.0 --port 8000"
-ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
-
-
--- a/examples/sycl/test.sh
+++ b/examples/sycl/test.sh
@@ -38,7 +38,7 @@ SEED=0
 GPUS_SETTING=""

 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
 NGL=99
 CONTEXT=4096
 GGML_SYCL_DEVICE=-1
@@ -119,13 +119,12 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
    echo "Use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
-    echo "Use all Intel GPUs, including iGPU & dGPU"
-    GPUS_SETTING="-sm ${SPLIT_MODE}"
+   echo "Use all Intel GPUs, including iGPU & dGPU"
 fi

-echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap "
-ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p \"${INPUT_PROMPT}\" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE}  --mmap "
+ZES_ENABLE_SYSMAN=1 "${BIN_FILE}" -m "${MODEL_FILE}" -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap

--- a/examples/sycl/win-start-svr.bat
+++ b/examples/sycl/win-start-svr.bat
@@ -1,179 +0,0 @@
-::  MIT license
-::  Copyright (C) 2024 Intel Corporation
-::  SPDX-License-Identifier: MIT
-
-@echo off
-setlocal EnableExtensions EnableDelayedExpansion
-
-set "BIN_FILE=.\build\bin\llama-server.exe"
-set "SEED=0"
-set "GPUS_SETTING="
-
-set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
-set "NGL=99"
-set "CONTEXT=4096"
-set "GGML_SYCL_DEVICE=-1"
-set "SPLIT_MODE=layer"
-set "LOG_VERBOSE=3"
-
-if "%~1"=="" goto after_args
-
-:parse_args
-if "%~1"=="" goto after_args
-
-if /I "%~1"=="-c" (
-  if "%~2"=="" goto missing_value
-  set "CONTEXT=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--context" (
-  if "%~2"=="" goto missing_value
-  set "CONTEXT=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-m" (
-  if "%~2"=="" goto missing_value
-  set "MODEL_FILE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--model" (
-  if "%~2"=="" goto missing_value
-  set "MODEL_FILE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-mg" (
-  if "%~2"=="" goto missing_value
-  set "GGML_SYCL_DEVICE=%~2"
-  set "SPLIT_MODE=none"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--main-gpu" (
-  if "%~2"=="" goto missing_value
-  set "GGML_SYCL_DEVICE=%~2"
-  set "SPLIT_MODE=none"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-sm" (
-  if "%~2"=="" goto missing_value
-  set "SPLIT_MODE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--split-mode" (
-  if "%~2"=="" goto missing_value
-  set "SPLIT_MODE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-ngl" (
-  if "%~2"=="" goto missing_value
-  set "NGL=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--n-gpu-layers" (
-  if "%~2"=="" goto missing_value
-  set "NGL=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-lv" (
-  if "%~2"=="" goto missing_value
-  set "LOG_VERBOSE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--log-verbosity" (
-  if "%~2"=="" goto missing_value
-  set "LOG_VERBOSE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-h" goto help
-if /I "%~1"=="--help" goto help
-
-echo Invalid option: %~1
-exit /b 1
-
-:missing_value
-echo Missing value for option: %~1
-exit /b 1
-
-:help
-echo Usage: %~n0 [OPTIONS]
-echo.
-echo This script processes files with specified options.
-echo.
-echo Options:
-echo   -h, --help    Display this help message and exit.
-echo   -c, --context ^<value^>    Set context length. Bigger need more memory.
-echo   -m, --model   ^<value^>    Full model file path.
-echo   -mg,--main-gpu ^<value^>   Set main GPU ID (0 - n) for single GPU mode.
-echo   -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
-echo                             - none: use one GPU only
-echo                             - layer (default): split layers and KV across GPUs
-echo                             - row: split rows across GPUs
-echo   -ngl,--n-gpu-layers ^<value^>  Max. number of layers to store in VRAM (default: -1)
-echo   -lv,--log-verbosity ^<value^>  Set the verbosity threshold. Messages with a higher verbosity will be
-echo                                ignored. Values:
-echo                                 - 0: generic output
-echo                                 - 1: error
-echo                                 - 2: warning
-echo                                 - 3: info
-echo                                 - 4: debug
-exit /b 0
-
-:after_args
-
-REM In Windows CMD, source is not available; call oneAPI setvars if present.
-if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
-  call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
-) else (
-  echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
-)
-
-REM Support malloc device memory more than 4GB.
-set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
-echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
-
-if not "%GGML_SYCL_DEVICE%"=="-1" (
-  echo Use %GGML_SYCL_DEVICE% as main GPU
-  REM Use single GPU only.
-  set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
-  set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
-  echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
-) else (
-  echo Use all Intel GPUs, including iGPU ^& dGPU
-  set "GPUS_SETTING=-sm %SPLIT_MODE%"
-)
-
-echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
-set "ZES_ENABLE_SYSMAN=1"
-%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
-
-endlocal
-
--- a/examples/sycl/win-test.bat
+++ b/examples/sycl/win-test.bat
@@ -2,200 +2,10 @@
 ::  Copyright (C) 2024 Intel Corporation
 ::  SPDX-License-Identifier: MIT

+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-@echo off
-setlocal EnableExtensions EnableDelayedExpansion
-
-REM MIT license
-REM Copyright (C) 2024 Intel Corporation
-REM SPDX-License-Identifier: MIT
-
-set "BIN_FILE=.\build\bin\llama-completion.exe"
-set "SEED=0"
-set "GPUS_SETTING="
-
-set "INPUT_PROMPT=Building a website can be done in 10 simple steps:^nStep 1:"
-set "MODEL_FILE=..\models\llama-2-7b.Q4_0.gguf"
-set "NGL=99"
-set "CONTEXT=4096"
-set "GGML_SYCL_DEVICE=-1"
-set "SPLIT_MODE=layer"
-set "LOG_VERBOSE=3"
-
-if "%~1"=="" goto after_args
-
-:parse_args
-if "%~1"=="" goto after_args
-
-if /I "%~1"=="-c" (
-  if "%~2"=="" goto missing_value
-  set "CONTEXT=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--context" (
-  if "%~2"=="" goto missing_value
-  set "CONTEXT=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-p" (
-  if "%~2"=="" goto missing_value
-  set "INPUT_PROMPT=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--promote" (
-  if "%~2"=="" goto missing_value
-  set "INPUT_PROMPT=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-m" (
-  if "%~2"=="" goto missing_value
-  set "MODEL_FILE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--model" (
-  if "%~2"=="" goto missing_value
-  set "MODEL_FILE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-mg" (
-  if "%~2"=="" goto missing_value
-  set "GGML_SYCL_DEVICE=%~2"
-  set "SPLIT_MODE=none"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--main-gpu" (
-  if "%~2"=="" goto missing_value
-  set "GGML_SYCL_DEVICE=%~2"
-  set "SPLIT_MODE=none"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-sm" (
-  if "%~2"=="" goto missing_value
-  set "SPLIT_MODE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--split-mode" (
-  if "%~2"=="" goto missing_value
-  set "SPLIT_MODE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-ngl" (
-  if "%~2"=="" goto missing_value
-  set "NGL=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--n-gpu-layers" (
-  if "%~2"=="" goto missing_value
-  set "NGL=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-lv" (
-  if "%~2"=="" goto missing_value
-  set "LOG_VERBOSE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-if /I "%~1"=="--log-verbosity" (
-  if "%~2"=="" goto missing_value
-  set "LOG_VERBOSE=%~2"
-  shift
-  shift
-  goto parse_args
-)
-
-if /I "%~1"=="-h" goto help
-if /I "%~1"=="--help" goto help
-
-echo Invalid option: %~1
-exit /b 1
-
-:missing_value
-echo Missing value for option: %~1
-exit /b 1
-
-:help
-echo Usage: %~n0 [OPTIONS]
-echo.
-echo This script processes files with specified options.
-echo.
-echo Options:
-echo   -h, --help    Display this help message and exit.
-echo   -c, --context ^<value^>    Set context length. Bigger need more memory.
-echo   -p, --promote ^<value^>    Prompt to start generation with.
-echo   -m, --model   ^<value^>    Full model file path.
-echo   -mg,--main-gpu ^<value^>   Set main GPU ID (0 - n) for single GPU mode.
-echo   -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
-echo                             - none: use one GPU only
-echo                             - layer (default): split layers and KV across GPUs
-echo                             - row: split rows across GPUs
-echo   -ngl,--n-gpu-layers ^<value^>  Max. number of layers to store in VRAM (default: -1)
-echo   -lv,--log-verbosity ^<value^>  Set the verbosity threshold. Messages with a higher verbosity will be
-echo                                ignored. Values:
-echo                                 - 0: generic output
-echo                                 - 1: error
-echo                                 - 2: warning
-echo                                 - 3: info
-echo                                 - 4: debug
-exit /b 0
-
-:after_args
-
-REM In Windows CMD, source is not available; call oneAPI setvars if present.
-if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
-  call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
-) else (
-  echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
-)
-
-REM Support malloc device memory more than 4GB.
-set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
-echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
-
-if not "%GGML_SYCL_DEVICE%"=="-1" (
-  echo Use %GGML_SYCL_DEVICE% as main GPU
-  REM Use single GPU only.
-  set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
-  set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
-  echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
-) else (
-  echo Use all Intel GPUs, including iGPU ^& dGPU
-  set "GPUS_SETTING=-sm %SPLIT_MODE%"
-)
-
-echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m %MODEL_FILE% -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
-set "ZES_ENABLE_SYSMAN=1"
-%BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
-
-endlocal
-
+:: support malloc device memory more than 4GB.
+set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
--- a/flake.lock
+++ b/flake.lock
@@ -0,0 +1,58 @@
+{
+  "nodes": {
+    "flake-parts": {
+      "inputs": {
+        "nixpkgs-lib": "nixpkgs-lib"
+      },
+      "locked": {
+        "lastModified": 1730504689,
+        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
+        "type": "github"
+      },
+      "original": {
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1732014248,
+        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "nixpkgs-lib": {
+      "locked": {
+        "lastModified": 1730504152,
+        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
+      },
+      "original": {
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-parts": "flake-parts",
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)

 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 11)
-set(GGML_VERSION_PATCH 1)
+set(GGML_VERSION_MINOR 9)
+set(GGML_VERSION_PATCH 11)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -213,7 +213,7 @@ set   (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
 set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")

 option(GGML_HIP                             "ggml: use HIP"                                   OFF)
-option(GGML_HIP_GRAPHS                      "ggml: use HIP graph"                              ON)
+option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
 option(GGML_HIP_RCCL                        "ggml: use ROCm Collective Comm. Library"         OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -169,7 +169,7 @@ extern "C" {
        // device type
        enum ggml_backend_dev_type type;
        // device id
-        //   for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:c1:00.0")
+        //   for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
        //   if the id is unknown, this should be NULL
        const char * device_id;
        // device capabilities
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -438,12 +438,6 @@ extern "C" {
        GGML_PREC_F32     = 10,
    };

-    // op hint
-    enum ggml_op_hint {
-        GGML_HINT_NONE             = 0,
-        GGML_HINT_SRC0_IS_HADAMARD = 1,
-    };
-
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN        = -1,
@@ -1425,11 +1419,6 @@ extern "C" {
            struct ggml_tensor * a,
            enum ggml_prec       prec);

-    // change the hint of a matrix multiplication
-    GGML_API void ggml_mul_mat_set_hint(
-            struct ggml_tensor * a,
-            enum ggml_op_hint    hint);
-
    // indirect matrix multiplication
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -470,10 +470,11 @@ endforeach()

 target_link_libraries(ggml-base PRIVATE Threads::Threads)

-if (DEFINED MATH_LIBRARY)
-    target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
-elseif (NOT WIN32 AND NOT DEFINED ENV{ONEAPI_ROOT})
-    target_link_libraries(ggml-base PRIVATE m)
+find_library(MATH_LIBRARY m)
+if (MATH_LIBRARY)
+    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
+        target_link_libraries(ggml-base PRIVATE m)
+    endif()
 endif()

 if (CMAKE_SYSTEM_NAME MATCHES "Android")
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
        if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
            t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
            if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
-                GGML_ASSERT(tensor->ne[split_dim] != 0);
+                GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
                const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
                GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);

@@ -1170,28 +1170,6 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer

        simple_tensors.push_back(t_ij);
    }
-
-    // If one of the sources has a zero-sized slice, disable the computation:
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
-            continue;
-        }
-
-        const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
-        if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
-            continue;
-        }
-        for (size_t j = 0; j < n_simple_bufs; j++) {
-            int64_t ne_sum = 0;
-            for (size_t s = 0; s < split_state_src.n_segments; s++) {
-                ne_sum += split_state_src.ne[s*n_simple_bufs + j];
-            }
-            if (ne_sum == 0) {
-                simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
-            }
-        }
-    }
-
    buf_ctx->simple_tensors[tensor] = simple_tensors;

    return GGML_STATUS_SUCCESS;
@@ -1205,57 +1183,40 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg

    if (split_state.n_segments != 1) {
        GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
+        GGML_ASSERT(offset == 0);
+        GGML_ASSERT(size == ggml_nbytes(tensor));
        GGML_ASSERT(tensor->ne[3] == 1);
-
        size_t offset_data = 0;
        std::vector<size_t> simple_offsets(n_bufs, 0);
        if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
            GGML_ASSERT(tensor->ne[2] == 1);
-
-            const size_t row_stride = tensor->nb[1];
-            GGML_ASSERT(offset % row_stride == 0);
-            GGML_ASSERT(size   % row_stride == 0);
-            const int64_t r_start = offset / row_stride;
-            const int64_t r_count = size   / row_stride;
-            GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
-
            const int64_t blck_size = ggml_blck_size(tensor->type);
            for (size_t s = 0; s < split_state.n_segments; s++) {
                for (size_t j = 0; j < n_bufs; j++) {
                    ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                    GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
                    const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
-                    ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
-                        simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
-                        r_count, simple_tensor->nb[1], tensor->nb[1]);
+                    ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
+                        tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
                    offset_data       += nbytes;
                    simple_offsets[j] += nbytes;
                }
            }
-            GGML_ASSERT(offset_data*r_count == size);
+            GGML_ASSERT(offset_data*tensor->ne[1] == size);
            return;
        }
        GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
-
-        const size_t row_stride = tensor->nb[2];
-        GGML_ASSERT(offset % row_stride == 0);
-        GGML_ASSERT(size   % row_stride == 0);
-        const int64_t r_start = offset / row_stride;
-        const int64_t r_count = size   / row_stride;
-        GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
-
        for (size_t s = 0; s < split_state.n_segments; s++) {
            for (size_t j = 0; j < n_bufs; j++) {
                ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
-                ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
-                    simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
-                    r_count, simple_tensor->nb[2], tensor->nb[2]);
+                ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
+                    tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
                offset_data       += nbytes;
                simple_offsets[j] += nbytes;
            }
        }
-        GGML_ASSERT(offset_data*r_count == size);
+        GGML_ASSERT(offset_data*tensor->ne[2] == size);
        return;
    }

@@ -1312,57 +1273,40 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co

    if (split_state.n_segments != 1) {
        GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
+        GGML_ASSERT(offset == 0);
+        GGML_ASSERT(size == ggml_nbytes(tensor));
        GGML_ASSERT(tensor->ne[3] == 1);
-
        size_t offset_data = 0;
        std::vector<size_t> simple_offsets(n_bufs, 0);
        if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
            GGML_ASSERT(tensor->ne[2] == 1);
-
-            const size_t row_stride = tensor->nb[1];
-            GGML_ASSERT(offset % row_stride == 0);
-            GGML_ASSERT(size   % row_stride == 0);
-            const int64_t r_start = offset / row_stride;
-            const int64_t r_count = size   / row_stride;
-            GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
-
            const int64_t blck_size = ggml_blck_size(tensor->type);
            for (size_t s = 0; s < split_state.n_segments; s++) {
                for (size_t j = 0; j < n_bufs; j++) {
                    const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                    GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
                    const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
-                    ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
-                        simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
-                        r_count, simple_tensor->nb[1], tensor->nb[1]);
+                    ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
+                        tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
                    offset_data       += nbytes;
                    simple_offsets[j] += nbytes;
                }
            }
-            GGML_ASSERT(offset_data*r_count == size);
+            GGML_ASSERT(offset_data*tensor->ne[1] == size);
            return;
        }
        GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
-
-        const size_t row_stride = tensor->nb[2];
-        GGML_ASSERT(offset % row_stride == 0);
-        GGML_ASSERT(size   % row_stride == 0);
-        const int64_t r_start = offset / row_stride;
-        const int64_t r_count = size   / row_stride;
-        GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
-
        for (size_t s = 0; s < split_state.n_segments; s++) {
            for (size_t j = 0; j < n_bufs; j++) {
                const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
                const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
-                ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
-                    simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
-                    r_count, simple_tensor->nb[2], tensor->nb[2]);
+                ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
+                    tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
                offset_data       += nbytes;
                simple_offsets[j] += nbytes;
            }
        }
-        GGML_ASSERT(offset_data*r_count == size);
+        GGML_ASSERT(offset_data*tensor->ne[2] == size);
        return;
    }

@@ -1498,20 +1442,17 @@ struct ggml_backend_meta_context {
    struct backend_config {
        ggml_backend_t backend;

-        std::vector<cgraph_config>           cgraphs;
-        std::vector<ggml_tensor *>           nodes;
-        std::vector<ggml_backend_buffer_ptr> bufs;
+        std::vector<cgraph_config> cgraphs;
+        std::vector<ggml_tensor *> nodes;
+        ggml_backend_buffer_ptr    buf;

-        backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
-            bufs.resize(n_reduce_steps);
-        }
+        backend_config(ggml_backend_t backend) : backend(backend) {}
    };
    std::string                 name;
    std::vector<backend_config> backend_configs;
    ggml_context_ptr            ctx;
    std::vector<ggml_cgraph *>  cgraphs_aux;
    std::vector<ggml_tensor *>  nodes_aux;
-    size_t                      n_reduce_steps;
    int                         max_nnodes    = 0;
    size_t                      max_tmp_size  = 0;
    size_t                      max_subgraphs = 0;
@@ -1523,7 +1464,6 @@ struct ggml_backend_meta_context {

    ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
        const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
-        n_reduce_steps = std::ceil(std::log2(n_devs));
        name = "Meta(";
        std::vector<ggml_backend_t> simple_backends;
        backend_configs.reserve(n_devs);
@@ -1535,7 +1475,7 @@ struct ggml_backend_meta_context {
            }
            name += ggml_backend_dev_name(simple_dev);
            simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
-            backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
+            backend_configs.emplace_back(simple_backends.back());
        }
        name += ")";

@@ -1565,6 +1505,10 @@ struct ggml_backend_meta_context {
            ggml_backend_free(bc.backend);
        }
    }
+
+    size_t n_reduce_steps() const {
+        return std::ceil(std::log2(backend_configs.size()));
+    }
 };

 static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
@@ -1717,36 +1661,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,

                ggml_tensor * node = cgraph->nodes[id];
                int32_t n_used = ggml_node_get_use_count(cgraph, id);
-
-                // Skip MIRRORED nodes that don't consume node
-                auto skip_unrelated = [&]() {
-                    while (id + 1 < cgraph->n_nodes) {
-                        ggml_tensor * next = cgraph->nodes[id+1];
-                        if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
-                            break;
-                        }
-                        bool safe = true;
-                        for (int s = 0; s < GGML_MAX_SRC; s++) {
-                            if (next->src[s] == nullptr) {
-                                continue;
-                            }
-                            if (next->src[s] == node) {
-                                safe = false;
-                                break;
-                            }
-                            if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
-                                safe = false;
-                                break;
-                            }
-                        }
-                        if (!safe) {
-                            break;
-                        }
-                        id++;
-                    }
-                };
-
-                skip_unrelated();
                if (id + 1 >= cgraph->n_nodes) {
                    return idr;
                }
@@ -1761,12 +1675,10 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                        n_used = ggml_node_get_use_count(cgraph, id);
                    }
                }
-                // Chain of MULs with MIRRORED src[1]
-                while (true) {
-                    skip_unrelated();
-                    if (id + 1 >= cgraph->n_nodes) {
-                        return idr;
-                    }
+                if (id + 1 >= cgraph->n_nodes) {
+                    return idr;
+                }
+                {
                    ggml_tensor * next = cgraph->nodes[id+1];
                    if (next->op == GGML_OP_MUL && next->src[0] == node &&
                            ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
@@ -1774,8 +1686,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                        id++;
                        idr = id;
                        n_used = ggml_node_get_use_count(cgraph, id);
-                    } else {
-                        break;
                    }
                }

@@ -1826,24 +1736,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                    continue;
                }

-                const int i_delayed = get_i_delayed(i);
-
-                // If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
-                // A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
-                //     its compute flag disabled and thus gets its data zeroed out.
-                // If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
-                if (i_delayed > i) {
-                    for (size_t j = 0; j < n_backends; j++) {
-                        auto & bcj = backend_ctx->backend_configs[j];
-                        if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
-                            for (int ii = i + 1; ii <= i_delayed; ii++) {
-                                bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
-                            }
-                        }
-                    }
-                }
-
-                i = i_delayed;
+                i = get_i_delayed(i);

                for (size_t j = 0; j < n_backends; j++) {
                    auto & bcj = backend_ctx->backend_configs[j];
@@ -1861,17 +1754,16 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
        if (max_tmp_size > backend_ctx->max_tmp_size) {
            for (size_t j = 0; j < n_backends; j++) {
                auto & bcj = backend_ctx->backend_configs[j];
-                for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
-                    bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
-                }
+                bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
            }
            backend_ctx->max_tmp_size = max_tmp_size;
        }

        if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
            backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
-            const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
-            const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
+            const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
+            const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
+            const size_t n_cgraphs_per_device = n_reduce_steps;    // 1 ADD graph per step
            const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
            const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
            const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
@@ -1920,6 +1812,11 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
    size_t iga = 0; // i graph aux
    size_t ina = 0; // i node aux

+    // FIXME usage_counts
+    auto get_cgraph_aux = [&]() -> ggml_cgraph * {
+        ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
+        return ret;
+    };
    auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
        ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
        memset(ret, 0, sizeof(ggml_tensor));
@@ -1931,110 +1828,75 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
        }
        return ret;
    };
-    auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
-        auto & bcj = backend_ctx->backend_configs[j];
-        ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
-        if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
-            buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
-        }
-        tensor->buffer = buf_ptr.get();
-        tensor->data   = ggml_backend_buffer_get_base(buf_ptr.get());
-    };
-    // FIXME usage_counts
-    auto get_cgraph_aux = [&]() -> ggml_cgraph * {
-        ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
-        return ret;
-    };

    // Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
    auto allreduce_fallback = [&](size_t i) -> ggml_status {
        std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);

-        // Zero out nodes that were disabled due to having a zero-sized slice:
-        for (size_t j = 0; j < n_backends; j++) {
-            auto & bcj = backend_ctx->backend_configs[j];
-            ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
-            if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
-                continue;
-            }
-            ggml_tensor * node_zero = get_node_aux(node);
-            node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
-            node_zero->src[0] = node;
-            ggml_set_op_params_f32(node_zero, 0, 0.0f);
-            node_zero->data = node->data;
-            node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
-
-            step_cgraphs[j] = get_cgraph_aux();
-            step_cgraphs[j]->nodes[0] = node_zero;
-            step_cgraphs[j]->n_nodes = 1;
-            const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
-            if (status != GGML_STATUS_SUCCESS) {
-                return status;
-            }
-        }
-        std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
-
-        auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
-            assert(step_cgraphs[j_dst] == nullptr);
-            auto & bcj_src = backend_ctx->backend_configs[j_src];
-            auto & bcj_dst = backend_ctx->backend_configs[j_dst];
-
-            ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
-            ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
-            GGML_ASSERT(ggml_is_contiguous(node_src));
-            GGML_ASSERT(ggml_is_contiguous(node_dst));
-
-            ggml_tensor * node_tmp = get_node_aux(node_dst);
-            set_tmp_data(node_tmp, j_dst, i_buf);
-
-            ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
-
-            ggml_tensor * node_red = get_node_aux(node_dst);
-            node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
-            node_red->view_offs = node_dst->view_offs;
-            node_red->op = GGML_OP_ADD;
-            node_red->src[0] = node_dst;
-            node_red->src[1] = node_tmp;
-            node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
-            ggml_backend_view_init(node_red);
-
-            ggml_cgraph * cgraph_aux = get_cgraph_aux();
-            cgraph_aux->nodes[0] = node_red;
-            cgraph_aux->n_nodes = 1;
-            step_cgraphs[j_dst] = cgraph_aux;
-        };
-
-        size_t offset_j = n_backends/2;
-        while ((offset_j & (offset_j - 1)) != 0) {
-            offset_j--;
-        }
-        const size_t offset_j_max = offset_j;
-        size_t i_buf = 0;
-
-        // If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
-        for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
-            const size_t j_dst = j_src - 2*offset_j_max;
-            push_data(j_src, j_dst, i_buf);
-            const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
-            if (status != GGML_STATUS_SUCCESS) {
-                return status;
-            }
-            i_buf = 1;
-        }
-
-        // Butterfly reduction:
-        for (; offset_j >= 1; offset_j /= 2) {
+        for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
            std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);

-            for (size_t j = 0; j < 2*offset_j_max; j++) {
+            for (size_t j = 0; j < n_backends; j++) {
                const size_t j_other = j ^ offset_j;
-                if (j_other >= n_backends) {
+                if (j_other > j) {
                    continue;
                }
-                push_data(j, j_other, i_buf);
+
+                auto & bcj1 = backend_ctx->backend_configs[j];
+                auto & bcj2 = backend_ctx->backend_configs[j_other];
+
+                ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
+                ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
+                GGML_ASSERT(ggml_is_contiguous(node1));
+                GGML_ASSERT(ggml_is_contiguous(node2));
+
+                // Tmp tensors to receive P2P copies
+                ggml_tensor * node_tmp_1 = get_node_aux(node1);
+                node_tmp_1->buffer = bcj1.buf.get();
+                node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
+
+                ggml_tensor * node_tmp_2 = get_node_aux(node2);
+                node_tmp_2->buffer = bcj2.buf.get();
+                node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
+
+                // 2 P2P copies: exchange full buffers
+                ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
+                ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
+
+                // Local ADD: node1 += tmp1 (in-place via view)
+                ggml_tensor * node_red_1 = get_node_aux(node1);
+                node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
+                node_red_1->view_offs = node1->view_offs;
+                node_red_1->op = GGML_OP_ADD;
+                node_red_1->src[0] = node1;
+                node_red_1->src[1] = node_tmp_1;
+                node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
+                ggml_backend_view_init(node_red_1);
+
+                // Local ADD: node2 += tmp2 (in-place via view)
+                ggml_tensor * node_red_2 = get_node_aux(node2);
+                node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
+                node_red_2->view_offs = node2->view_offs;
+                node_red_2->op = GGML_OP_ADD;
+                node_red_2->src[0] = node2;
+                node_red_2->src[1] = node_tmp_2;
+                node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
+                ggml_backend_view_init(node_red_2);
+
+                // Build 1-node cgraphs for the ADD ops
+                ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
+                cgraph_aux_1->nodes[0] = node_red_1;
+                cgraph_aux_1->n_nodes = 1;
+                step_cgraphs[j] = cgraph_aux_1;
+
+                ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
+                cgraph_aux_2->nodes[0] = node_red_2;
+                cgraph_aux_2->n_nodes = 1;
+                step_cgraphs[j_other] = cgraph_aux_2;
            }

-            for (size_t j = 0; j < 2*offset_j_max; j++) {
+            // Execute local ADDs for this step
+            for (size_t j = 0; j < n_backends; j++) {
                if (step_cgraphs[j] == nullptr) {
                    continue;
                }
@@ -2044,20 +1906,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                    return status;
                }
            }
-            i_buf++;
        }
-        assert(i_buf == backend_ctx->n_reduce_steps);
-
-        // If n_backends is not a power of 2, copy back the reduced tensors to the excess:
-        for (size_t j = 2*offset_j_max; j < n_backends; j++) {
-            auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
-            auto & bcj_dst = backend_ctx->backend_configs[j];
-
-            ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
-            ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
-            ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
-        }
-
        return GGML_STATUS_SUCCESS;
    };

@@ -2100,8 +1949,8 @@ static const ggml_backend_i ggml_backend_meta_i = {
    /* .free                    = */ ggml_backend_meta_free,
    /* .set_tensor_async        = */ ggml_backend_meta_set_tensor_async,
    /* .get_tensor_async        = */ ggml_backend_meta_get_tensor_async,
-    /* .set_tensor_2d_async     = */ nullptr,
    /* .get_tensor_2d_async     = */ nullptr,
+    /* .set_tensor_2d_async     = */ nullptr,
    /* .cpy_tensor_async        = */ nullptr,
    /* .synchronize             = */ ggml_backend_meta_synchronize,
    /* .graph_plan_create       = */ nullptr,
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -181,12 +181,6 @@ struct ggml_backend_registry {
            return;
        }

-        for (auto & entry : backends) {
-            if (entry.reg == reg) {
-                return;
-            }
-        }
-
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
@@ -198,12 +192,6 @@ struct ggml_backend_registry {
    }

    void register_device(ggml_backend_dev_t device) {
-        for (auto & dev : devices) {
-            if (dev == device) {
-                return;
-            }
-        }
-
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -965,7 +965,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
        }
        if (sched->debug > 1) {
            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_desc(node), node->name,
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
                graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	35df147d80	cont : remove /api/tags	2026-04-20 15:45:42 +03:00
Georgi Gerganov	c1891fd6eb	server : remove /api endpoints	2026-04-20 15:34:18 +03:00