Compare commits

..

2 Commits

Author SHA1 Message Date
arthw  2985be3324  update hw info  2026-03-31 09:24:40 +08:00
arthw  8dc96153c3  enhance FA stable in UT  2026-03-17 15:57:02 +08:00
559 changed files with 33559 additions and 82351 deletions

View File

@@ -4,7 +4,7 @@
# Define the CANN base image for easier version updates later
ARG CHIP_TYPE=910b
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
# ==============================================================================
# BUILD STAGE

View File

@@ -1,13 +1,11 @@
ARG UBUNTU_VERSION=24.04
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
RUN apt-get update && \
apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
ENV CC=gcc-14 CXX=g++-14
apt-get install -y build-essential git cmake libssl-dev
WORKDIR /app
@@ -36,7 +34,7 @@ RUN mkdir -p /app/full \
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -57,9 +55,8 @@ RUN apt-get update \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=13.1.1
ARG CUDA_VERSION=13.1.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
@@ -12,9 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
WORKDIR /app
@@ -41,7 +39,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=24.04
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.8.1
ARG CUDA_VERSION=12.4.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
@@ -12,9 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
WORKDIR /app
@@ -41,7 +39,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -62,8 +60,7 @@ RUN apt-get update \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --upgrade pip setuptools wheel \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \

View File

@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
## Build Image
@@ -33,25 +33,8 @@ RUN mkdir -p /app/full \
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb
RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -1,4 +1,4 @@
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
FROM ascendai/cann:$ASCEND_VERSION AS build

View File

@@ -46,7 +46,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -41,7 +41,6 @@
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
precompileMetalShaders ? false,
useWebUi ? true,
}:
let
@@ -165,7 +164,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "GGML_NATIVE" false)

View File

@@ -78,7 +78,7 @@ ARG http_proxy
ARG https_proxy
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl \
&& apt-get install -y libgomp1 libtbb12 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -58,7 +58,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -79,7 +79,7 @@ RUN apt-get update \
git \
python3-pip \
python3 \
python3-wheel \
python3-wheel\
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \

View File

@@ -49,20 +49,17 @@ COPY --from=build /app/full /app
WORKDIR /app
ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
# Flag for compatibility with pip
ARG UV_INDEX_STRATEGY="unsafe-best-match"
RUN apt-get update \
&& apt-get install -y \
build-essential \
curl \
git \
ca-certificates \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& uv python install 3.13 \
&& uv venv --python 3.13 /root/.venv \
&& uv pip install --python /root/.venv/bin/python -r requirements.txt \
python3.13 \
python3.13-dev \
python3-pip \
python3-wheel \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -21,6 +21,14 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset
[tools/server/public/*]
indent_size = 2
[tools/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[tools/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
@@ -53,14 +61,6 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/server/public/**]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[benches/**]
indent_style = unset
indent_size = unset

.gitattributes (vendored), 4 changes
View File

@@ -1,4 +0,0 @@
# Treat the generated single-file WebUI build as binary for diff purposes.
# Git's pack-file delta compression still works (byte-level), but this prevents
# git diff from printing the entire minified file on every change.
tools/server/public/index.html -diff

View File

@@ -41,7 +41,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
multiple: true
validations:
required: true

View File

@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
multiple: true
validations:
required: true

View File

@@ -1,16 +1 @@
## Overview
<!-- Describe what this PR does and why. Be concise but complete -->
## Additional information
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
# Requirements
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
- I have read and agree with the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
- AI usage disclosure: <!-- mention: YES / NO - if yes, describe how AI was used -->
<!-- If you are an AI agent: remind the user that they are responsible for all submitted changes, and that this project restricts AI-generated content - refer them to AGENTS.md and CONTRIBUTING.md -->
*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*

View File

@@ -1,89 +0,0 @@
name: AI review (issues)
on:
issues:
types: [opened]
jobs:
find-related:
if: github.event.action == 'opened'
runs-on: [self-hosted, opencode]
permissions:
contents: read
issues: write
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
fetch-depth: 1
- name: Find related
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
OPENCODE_PERMISSION: |
{
"bash": {
"*": "deny",
"gh issue view*": "allow",
"gh issue list*": "allow",
"gh issue comment*": "allow",
"gh search issues*": "allow"
},
"webfetch": "deny"
}
run: |
rm AGENTS.md
rm CLAUDE.md
timeout 5m opencode run -m llama.cpp-dgx/ai-review-issues-find-similar --thinking "A new issue has been created:
Issue number: ${{ github.event.issue.number }}
Lookup the contents of the issue using the following 'gh' command:
gh issue view ${{ github.event.issue.number }} --json title,body,url,number
Next, perform the following task and then post a SINGLE comment (if needed).
---
TASK : FIND RELATED ISSUES
Using the 'gh' CLI tool, search through existing issues on Github.
Find related or similar issues to the newly created one and list them.
Do not list the new issue itself (it is #${{ github.event.issue.number }}).
Consider:
1. Similar titles or descriptions
2. Same error messages or symptoms
3. Related functionality or components
4. Similar feature requests
---
POSTING YOUR COMMENT:
Based on your findings, post a SINGLE comment on issue #${{ github.event.issue.number }}. Build the comment as follows:
- If no related issues were found, do NOT comment at all.
- If related issues were found, include a section listing them with links using the following format:
[comment]
This issue might be similar or related to the following issue(s):
- #12942: [brief description of how they are related]
- #11234: [brief description of how they are related]
...
_This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_
[/comment]
Remember:
- Do not include the comment tags in your actual comment.
- Post at most ONE comment combining all findings.
- If you didn't find issues that are related enough, post nothing.
- You have access only to the 'gh' CLI tool - don't try to use other tools.
- If the output from a tool call is too long, try to limit down the search.
"

View File

@@ -40,9 +40,13 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
# Disabled due to size (400MB) and always 0 cache hits
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.16
# with:
# key: android-build
# evict-old-files: 1d
- name: Set up JDK
uses: actions/setup-java@v5
@@ -51,7 +55,7 @@ jobs:
distribution: zulu
- name: Setup Android SDK
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
uses: android-actions/setup-android@v3
with:
log-accepted-android-sdk-licenses: false
@@ -62,11 +66,10 @@ jobs:
android-ndk:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
env:
OPENCL_VERSION: 2025.07.22
strategy:
matrix:
include:
@@ -79,23 +82,59 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
mkdir opencl
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
tar -xaf opencl/headers.tar.gz -C opencl
tar -xaf opencl/clhpp.tar.gz -C opencl
tar -xaf opencl/icd-loader.tar.gz -C opencl
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
cmake --build build
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
rm -rf opencl
- name: Install Hexagon SDK
id: install_hexsdk
if: ${{ matrix.build == 'arm64-snapdragon' }}
env:
HEXSDK_VER: 6.4.0.2
HEXTLS_VER: 19.0.04
run: |
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
mkdir hex-sdk
tar -xaf hex-sdk.tar.gz -C hex-sdk
ls -l hex-sdk
sudo mv hex-sdk /opt/hexagon
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"
- name: Update CMake presets
id: update_presets
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
- name: Build
id: ndk_build
run: |
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-${{ matrix.build }}
path: pkg-adb/llama.cpp
- name: Test
id: cmake_test
run: |
echo "FIXME: test on devices"

View File

@@ -46,7 +46,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-ios
evict-old-files: 1d
@@ -124,7 +124,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-tvos
evict-old-files: 1d
@@ -186,7 +186,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-swift
evict-old-files: 1d

View File

@@ -63,7 +63,7 @@ jobs:
- name: Set container image
id: cann-image
run: |
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image

View File

@@ -43,7 +43,7 @@ jobs:
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
uses: msys2/setup-msys2@v2
with:
update: true
msystem: ${{matrix.sys}}

View File

@@ -43,7 +43,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
evict-old-files: 1d

View File

@@ -141,61 +141,60 @@ jobs:
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# TODO: sandbox Mac runners
# ggml-ci-mac-metal:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-webgpu:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Dawn Dependency
# id: dawn-depends
# run: |
# DAWN_VERSION="v2.0.0"
# DAWN_OWNER="reeselevine"
# DAWN_REPO="dawn"
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# curl -L -o artifact.zip \
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# mkdir dawn
# unzip artifact.zip
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-vulkan:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-webgpu:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Test
id: ggml-ci
run: |
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-linux-intel-vulkan:
runs-on: [self-hosted, Linux, Intel]

View File

@@ -45,7 +45,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-vulkan-llvmpipe
evict-old-files: 1d

View File

@@ -69,7 +69,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-arm64
evict-old-files: 1d
@@ -87,7 +87,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
- name: Test
@@ -105,7 +105,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-x64
evict-old-files: 1d
@@ -124,7 +124,7 @@ jobs:
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
@@ -141,7 +141,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-arm64-webgpu
evict-old-files: 1d
@@ -150,22 +150,23 @@ jobs:
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export CMAKE_PREFIX_PATH=dawn
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
@@ -180,7 +181,7 @@ jobs:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
os: ubuntu-22.04-arm
- build: 's390x'
os: ubuntu-24.04-s390x
- build: 'ppc64le'
@@ -194,8 +195,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
if: ${{ matrix.build != 's390x' && matrix.build != 'ppc64le' }}
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-cpu-${{ matrix.build }}
evict-old-files: 1d
@@ -206,22 +206,14 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev python3-wheel \
python3 python3-pip python3-dev \
libjpeg-dev build-essential libssl-dev \
git-lfs
- name: Toolchain workaround (GCC 14)
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Python Dependencies
id: python_depends
run: |
export PIP_BREAK_SYSTEM_PACKAGES="1"
python3 -m pip install --upgrade pip setuptools
python3 -m pip install --upgrade pip
pip3 install ./gguf-py
- name: Swap Endianness
@@ -238,7 +230,7 @@ jobs:
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@@ -281,16 +273,14 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev ninja-build
sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@@ -299,15 +289,7 @@ jobs:
ctest -L main --verbose
ubuntu-24-vulkan:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-24.04
- build: 'arm64'
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Clone
@@ -317,16 +299,12 @@ jobs:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
sudo apt-get install -y glslc libvulkan-dev libssl-dev
- name: Configure
id: cmake_configure
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -335,7 +313,7 @@ jobs:
- name: Build
id: cmake_build
run: |
time cmake --build build -j $(nproc)
cmake --build build -j $(nproc)
ubuntu-24-webgpu:
runs-on: ubuntu-24.04
@@ -346,7 +324,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-webgpu
evict-old-files: 1d
@@ -357,8 +335,7 @@ jobs:
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@@ -383,15 +360,16 @@ jobs:
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -399,7 +377,7 @@ jobs:
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
-DGGML_WEBGPU=ON
time cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@@ -425,7 +403,7 @@ jobs:
- name: Fetch emdawnwebgpu
run: |
DAWN_TAG="v20260317.182325"
DAWN_TAG="v20251027.212519"
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
echo "Downloading ${EMDAWN_PKG}"
curl -L -o emdawn.zip \
@@ -436,13 +414,11 @@ jobs:
run: |
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_WEBGPU=ON \
-DLLAMA_OPENSSL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
cmake --build build-wasm --target test-backend-ops -j $(nproc)
ubuntu-22-hip:
runs-on: ubuntu-22.04
@@ -460,7 +436,7 @@ jobs:
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-hip
evict-old-files: 1d
@@ -491,7 +467,7 @@ jobs:
apt-get install -y build-essential git cmake libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-musa
evict-old-files: 1d
@@ -502,7 +478,7 @@ jobs:
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl:
runs-on: ubuntu-22.04
@@ -537,7 +513,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-sycl
evict-old-files: 1d
@@ -551,7 +527,7 @@ jobs:
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
time cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl-fp16:
runs-on: ubuntu-22.04
@@ -574,7 +550,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
@@ -586,7 +562,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-sycl-fp16
evict-old-files: 1d
@@ -597,13 +573,11 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
time cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
@@ -631,7 +605,7 @@ jobs:
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
@@ -673,7 +647,7 @@ jobs:
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
@@ -718,7 +692,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-${{ matrix.build }}
variant: ccache
@@ -824,7 +798,7 @@ jobs:
apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-cuda
evict-old-files: 1d
@@ -856,7 +830,7 @@ jobs:
uses: actions/checkout@v6
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
@@ -909,7 +883,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-sycl
variant: ccache
@@ -970,7 +944,7 @@ jobs:
& $clangPath.FullName --version
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ${{ github.job }}
evict-old-files: 1d
@@ -1064,7 +1038,7 @@ jobs:
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
time cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@@ -1094,7 +1068,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-x64-cpu-low-perf
evict-old-files: 1d
@@ -1120,7 +1094,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-low-perf
evict-old-files: 1d
@@ -1146,7 +1120,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-x64-cpu-high-perf
evict-old-files: 1d
@@ -1172,7 +1146,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-high-perf
evict-old-files: 1d
@@ -1198,7 +1172,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-high-perf-sve
evict-old-files: 1d
@@ -1224,7 +1198,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-kleidiai
evict-old-files: 1d
@@ -1276,7 +1250,7 @@ jobs:
sudo apt-get install -y cmake
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-kleidiai-graviton4
evict-old-files: 1d

View File

@@ -29,7 +29,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: copilot-setup-steps
evict-old-files: 1d
@@ -52,5 +52,6 @@ jobs:
- name: Install Python dependencies
run: |
python3 -m venv .venv
source .venv/bin/activate
.venv/bin/activate
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
pip install flake8 pyright pre-commit

View File

@@ -25,20 +25,50 @@ permissions:
packages: write
jobs:
create_tag:
name: Create and push git tag
runs-on: ubuntu-slim
permissions:
contents: write
outputs:
source_tag: ${{ steps.srctag.outputs.name }}
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ${{ matrix.config.runs_on }}
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
fail-fast: false
matrix:
config:
# Multi-stage build
# Note: the arm64 images are failing, which prevents the amd64 images from being built
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
- { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
steps:
- name: Clone
id: checkout
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0
fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU
if: ${{ matrix.config.tag != 's390x' }}
uses: docker/setup-qemu-action@v3
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine source tag name
id: srctag
@@ -46,125 +76,13 @@ jobs:
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Create and push git tag
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0
prepare_matrices:
name: Prepare Docker matrices
runs-on: ubuntu-24.04
outputs:
build_matrix: ${{ steps.matrices.outputs.build_matrix }}
merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}
steps:
- name: Generate build and merge matrices
id: matrices
- name: Determine image tag name
id: tag
shell: bash
run: |
set -euo pipefail
# Keep all build targets in one place and derive merge targets from it.
cat > build-matrix.json <<'JSON'
[
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
]
JSON
BUILD_MATRIX="$(jq -c . build-matrix.json)"
MERGE_MATRIX="$(jq -c '
reduce .[] as $entry ({}; .[$entry.tag] |= (
. // {
tag: $entry.tag,
arches: [],
full: false,
light: false,
server: false
}
| .full = (.full or ($entry.full // false))
| .light = (.light or ($entry.light // false))
| .server = (.server or ($entry.server // false))
| .arches += [($entry.platforms | sub("^linux/"; ""))]
))
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
| if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
. + {
s390x: {
tag: "s390x",
arches: ["s390x"],
full: .cpu.full,
light: .cpu.light,
server: .cpu.server
}
}
else
.
end
| [.[] | .arches = (.arches | unique | sort | join(" "))]
' build-matrix.json)"
echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
push_to_registry:
name: Push Docker image to Docker Registry
needs: [prepare_matrices, create_tag]
runs-on: ${{ matrix.config.runs_on }}
strategy:
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
steps:
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ needs.create_tag.outputs.source_tag }}
- name: Set up QEMU
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
with:
image: tonistiigi/binfmt:qemu-v10.2.1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
- name: Log in to Docker Registry
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine image metadata
id: meta
shell: bash
run: |
set -euo pipefail
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
PREFIX="${IMAGE_REPO}:"
PLATFORM="${{ matrix.config.platforms }}"
ARCH_SUFFIX="${PLATFORM#linux/}"
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
# list all tags possible
tags="${{ matrix.config.tag }}"
@@ -174,16 +92,19 @@ jobs:
else
TYPE="-$tag"
fi
CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"
CACHETAGS="${PREFIX}buildcache${TYPE}"
FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
done
SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"
echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT
echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT
echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT
echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT
echo "cache_output_tag=$CACHETAG" # print out for debugging
echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
echo "cache_output_tags=$CACHETAGS" # print out for debugging
echo "full_output_tags=$FULLTAGS" # print out for debugging
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
@@ -204,14 +125,15 @@ jobs:
docker-images: true
swap-storage: true
- name: Build and push Full Docker image by digest
id: build_full
- name: Build and push Full Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
# tag list is generated from step above
tags: ${{ steps.tag.outputs.full_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
@@ -225,17 +147,18 @@ jobs:
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Light Docker image by digest
id: build_light
- name: Build and push Light Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
# tag list is generated from step above
tags: ${{ steps.tag.outputs.light_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
@@ -249,17 +172,18 @@ jobs:
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Server Docker image by digest
id: build_server
- name: Build and push Server Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
# tag list is generated from step above
tags: ${{ steps.tag.outputs.server_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
@@ -273,170 +197,31 @@ jobs:
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Export digest metadata
shell: bash
run: |
set -euo pipefail
TAGS="${{ matrix.config.tag }}"
ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
mkdir -p /tmp/digests
add_digest_rows() {
local image_type="$1"
local digest="$2"
if [[ -z "$digest" ]]; then
echo "Missing digest for image_type=${image_type}" >&2
exit 1
fi
for tag in $TAGS; do
printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
done
}
if [[ "${{ matrix.config.full }}" == "true" ]]; then
add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
fi
if [[ "${{ matrix.config.light }}" == "true" ]]; then
add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
fi
if [[ "${{ matrix.config.server }}" == "true" ]]; then
add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
fi
- name: Upload digest metadata
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
with:
name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}
path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv
if-no-files-found: error
merge_arch_tags:
name: Create shared tags from digests
needs: [prepare_matrices, push_to_registry, create_tag]
runs-on: ubuntu-24.04
strategy:
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
create_tag:
name: Create and push git tag
runs-on: ubuntu-22.04
permissions:
contents: write
steps:
- name: Check out the repo
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Download digest metadata
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
with:
pattern: digests-*
path: /tmp/digests
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
- name: Log in to Docker Registry
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Create tags from digests
shell: bash
run: |
set -euo pipefail
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
PREFIX="${IMAGE_REPO}:"
SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
TAGS="${{ matrix.config.tag }}"
ARCHES="${{ matrix.config.arches }}"
DIGEST_GLOB="/tmp/digests/*.tsv"
if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then
echo "No digest metadata found in /tmp/digests" >&2
exit 1
fi
if [[ -z "$SRC_TAG" ]]; then
echo "Missing source tag from create_tag" >&2
exit 1
fi
find_digest() {
local tag_name="$1"
local arch="$2"
local image_type="$3"
local digest
digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
fi
if [[ -z "$digest" ]]; then
echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
exit 1
fi
echo "$digest"
}
create_manifest_tags() {
local image_type="$1"
local tag_name="$2"
local suffix="$3"
local merged_tag="${PREFIX}${image_type}${suffix}"
local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
local refs=()
for arch in $ARCHES; do
local digest
digest="$(find_digest "$tag_name" "$arch" "$image_type")"
refs+=("${IMAGE_REPO}@${digest}")
done
echo "Creating ${merged_tag} from ${refs[*]}"
docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
echo "Creating ${merged_versioned_tag} from ${refs[*]}"
docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
}
for tag in $TAGS; do
if [[ "$tag" == "cpu" ]]; then
TYPE=""
else
TYPE="-$tag"
fi
if [[ "${{ matrix.config.full }}" == "true" ]]; then
create_manifest_tags "full" "$tag" "$TYPE"
fi
if [[ "${{ matrix.config.light }}" == "true" ]]; then
create_manifest_tags "light" "$tag" "$TYPE"
fi
if [[ "${{ matrix.config.server }}" == "true" ]]; then
create_manifest_tags "server" "$tag" "$TYPE"
fi
done
- name: Determine source tag name
id: srctag
uses: ./.github/actions/get-tag-name
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Create and push git tag
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0

View File

@@ -23,7 +23,7 @@ jobs:
runs-on: ubuntu-slim
steps:
- uses: actions/checkout@v6
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
- uses: editorconfig-checker/action-editorconfig-checker@v2
with:
version: v3.0.3
- run: editorconfig-checker

View File

@@ -28,17 +28,17 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.11'
python-version: '3.9.x'
- name: Install dependencies
run: |
cd gguf-py
python -m pip install poetry==2.3.2
python -m pip install poetry
poetry install
- name: Build package
run: cd gguf-py && poetry build
- name: Publish package
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
packages-dir: gguf-py/dist

View File

@@ -1,82 +0,0 @@
name: HIP quality check
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/hip-quality-check.yml',
'**/*.cu',
'**/*.cuh',
'scripts/hip/gcn-cdna-vgpr-check.py'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/hip-quality-check.yml',
'**/*.cu',
'**/*.cuh',
'scripts/hip/gcn-cdna-vgpr-check.py'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-22-hip-quality-check:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:7.2
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev python3
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-hip-quality-check
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with Werror
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGPU_TARGETS=gfx908 \
-DGGML_HIP=ON \
-DGGML_HIP_EXPORT_METRICS=Off \
-DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
-DCMAKE_BUILD_TYPE=Release
cd build
make -j $(nproc)
- name: Check for major VGPR spills
id: vgpr_check
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGPU_TARGETS=gfx908 \
-DGGML_HIP=ON \
-DGGML_HIP_EXPORT_METRICS=On \
-DCMAKE_HIP_FLAGS="" \
-DCMAKE_BUILD_TYPE=Release
cd build
make -j $(nproc) 2>&1 | tee metrics.log | grep -v 'Rpass-analysis=kernel-resource-usage\|remark:\|^$'
python3 ../scripts/hip/gcn-cdna-vgpr-check.py metrics.log

View File

@@ -31,6 +31,6 @@ jobs:
with:
python-version: "3.11"
- name: flake8 Lint
uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
uses: py-actions/flake8@v2
with:
plugins: "flake8-no-print"

View File

@@ -4,17 +4,15 @@ on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- 'ty.toml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
# - 'pyrightconfig.json'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- 'ty.toml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
# - 'pyrightconfig.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -22,8 +20,8 @@ concurrency:
jobs:
python-type-check:
runs-on: ubuntu-slim
name: python type-check
runs-on: ubuntu-latest
name: pyright type-check
steps:
- name: Check out source repository
uses: actions/checkout@v6
@@ -31,13 +29,10 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
pip-install: -r requirements/requirements-all.txt ty==0.0.26
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:
# version: 1.1.382
# level: warning
# warnings: true
- name: Type-check with ty
run: |
ty check --output-format=github
pip-install: -r requirements/requirements-all.txt
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.382
level: warning
warnings: true

View File

@@ -47,7 +47,7 @@ jobs:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-arm64
evict-old-files: 1d
@@ -94,7 +94,7 @@ jobs:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: macOS-latest-x64
evict-old-files: 1d
@@ -131,16 +131,17 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
name: llama-bin-macos-x64.tar.gz
ubuntu-cpu:
ubuntu-22-cpu:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
- build: 's390x'
os: ubuntu-24.04-s390x
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
# - build: 'arm64'
# os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }}
@@ -152,8 +153,7 @@ jobs:
fetch-depth: 0
- name: ccache
if: ${{ matrix.build != 's390x' }}
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-cpu-${{ matrix.build }}
evict-old-files: 1d
@@ -164,13 +164,6 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential libssl-dev
- name: Toolchain workaround (GCC 14)
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Build
id: cmake_build
run: |
@@ -200,16 +193,8 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-vulkan:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
steps:
- name: Clone
@@ -219,25 +204,18 @@ jobs:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-vulkan-${{ matrix.build }}
key: ubuntu-22-vulkan
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
if [[ "${{ matrix.os }}" =~ "ubuntu-22.04" ]]; then
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
else
sudo apt-get update -y
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
fi
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
- name: Build
id: cmake_build
@@ -260,13 +238,13 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz
ubuntu-24-openvino:
runs-on: ubuntu-24.04
@@ -291,7 +269,7 @@ jobs:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-openvino-release-no-preset-v1
evict-old-files: 1d
@@ -364,7 +342,7 @@ jobs:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cpu-${{ matrix.arch }}
variant: ccache
@@ -425,7 +403,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
variant: ccache
@@ -495,7 +473,7 @@ jobs:
uses: actions/checkout@v6
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
@@ -571,7 +549,7 @@ jobs:
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-sycl
variant: ccache
@@ -651,7 +629,7 @@ jobs:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
evict-old-files: 1d
@@ -761,7 +739,7 @@ jobs:
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
evict-old-files: 1d
@@ -928,7 +906,7 @@ jobs:
- name: Set container image
id: cann-image
run: |
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image
@@ -998,8 +976,8 @@ jobs:
- windows-sycl
- windows-hip
- ubuntu-22-rocm
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-22-cpu
- ubuntu-22-vulkan
- ubuntu-24-openvino
- macOS-arm64
- macOS-x64
@@ -1082,11 +1060,9 @@ jobs:
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-arm64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
**Windows:**

2
.gitignore vendored
View File

@@ -95,8 +95,6 @@
# Server Web UI temporary files
/tools/server/webui/node_modules
/tools/server/webui/dist
# we no longer use gz for index.html
/tools/server/public/index.html.gz
# Python

119
AGENTS.md
View File

@@ -5,106 +5,77 @@
>
> Read more: [CONTRIBUTING.md](CONTRIBUTING.md)
AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below).
AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below)
---
## Guidelines for Contributors Using AI
llama.cpp is built by humans, for humans. Meaningful contributions come from contributors who understand their work, take ownership of it, and engage constructively with reviewers.
These use cases are **permitted** when making a contribution with the help of AI:
Maintainers receive numerous pull requests weekly, many of which are AI-generated submissions where the author cannot adequately explain the code, debug issues, or participate in substantive design discussions. Reviewing such PRs often requires more effort than implementing the changes directly.
- Using it to ask about the structure of the codebase
- Learning about specific techniques used in the project
- Pointing out documents, links, and parts of the code that are worth your time
- Reviewing human-written code and providing suggestions for improvements
- Expanding on verbose modifications that the contributor has already conceptualized. For example:
- Generating repeated lines with minor variations (this should only be used for short code snippets where deduplication would add more complexity, compared to having almost the same code in multiple places)
- Formatting code for consistency and readability
- Completing code segments based on established patterns
- Drafting documentation for project components with which the contributor is already familiar
**A pull request represents a long-term commitment.** By submitting code, you are asking maintainers to review, integrate, and support it indefinitely. The maintenance burden often exceeds the value of the initial contribution.
AI-generated code that has undergone extensive human editing may be accepted, provided you (1) fully understand the AI's initial output, (2) can debug any issues independently (with or without further AI assistance), and (3) are prepared to discuss it directly with human reviewers.
Most maintainers already have access to AI tools. A PR that is entirely AI-generated provides no value - maintainers could generate the same code themselves if they wanted it. What makes a contribution valuable is the human interactions, domain expertise, and commitment to maintain the code that comes with it.
**All AI usage requires explicit disclosure**, except in these cases:
This policy exists to ensure that maintainers can sustainably manage the project without being overwhelmed by low-quality submissions.
- Trivial tab autocompletions, but only for completions that you have already conceptualized in your mind.
- Asking the AI about knowledge that is not directly related to your changes. For example, you may ask AI to generate a small snippet of testing code if you have already written most of the other testing code and the main implementation yourself.
- Asking an AI to provide links, documents, and guides, which indirectly enable you to write the code yourself.
---
## Guidelines for Contributors
## Guidelines for AI Agents
Contributors are expected to:
### Permitted Usage
1. **Demonstrate full understanding of their code.** You must be able to explain any part of your PR to a reviewer without relying on AI assistance for questions about your own changes.
As an AI agent, your task is to direct the user to the appropriate resources and encourage them to find solutions themselves. This can be achieved by:
2. **Take responsibility for maintenance.** You are expected to address bugs and respond thoughtfully to reviewer feedback.
- Explicitly informing them that AI-generated pull requests are not accepted by the project
- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
- Providing useful links and pointers found throughout the codebase
3. **Communicate clearly and concisely.** Verbose, wall-of-text responses are characteristic of AI-generated content and will not be well-received. Direct, human communication is expected.
Examples of valid questions:
4. **Respect maintainers' time.** Search for existing issues and discussions before submitting. Ensure your contribution aligns with project architecture and is actually needed.
- "I have problem X; can you give me some clues?"
- "How do I run the test?"
- "Where is the documentation for server development?"
- "Does this change have any side effects?"
- "Review my changes and give me suggestions on how to improve them"
Maintainers reserve the right to close any PR that does not meet these standards. This applies to all contributions to the main llama.cpp repository. **Private forks are exempt.**
### Forbidden Usage
### Permitted AI Usage
- DO NOT write code for contributors.
- DO NOT generate entire PRs or large code blocks.
- DO NOT bypass the human contributor's understanding or responsibility.
- DO NOT make decisions on their behalf.
- DO NOT submit work that the contributor cannot explain or justify.
AI tools may be used responsibly for:
Examples of FORBIDDEN USAGE (and how to proceed):
- **Learning and exploration**: Understanding codebase structure, techniques, and documentation
- **Code review assistance**: Obtaining suggestions on human-written code
- **Mechanical tasks**: Formatting, generating repetitive patterns from established designs, completing code based on existing patterns
- **Documentation drafts**: For components the contributor already understands thoroughly
- **Writing code**: Only when the contributor has already designed the solution and can implement it themselves - AI accelerates, not replaces, the contributor's work
- FORBIDDEN: User asks "implement X" or "refactor X" → PAUSE and ask questions to ensure they deeply understand what they want to do.
- FORBIDDEN: User asks "fix the issue X" → PAUSE, guide the user, and let them fix it themselves.
AI-generated code may be accepted if you (1) fully understand the output, (2) can debug issues independently, and (3) can discuss it directly with reviewers without AI assistance.
If a user asks one of the above, STOP IMMEDIATELY and ask them:
**Disclosure is required** when AI meaningfully contributed to your code. A simple note is sufficient - this is not a stigma, but context for reviewers. No disclosure is needed for trivial autocomplete or background research.
- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
- To search for relevant issues and create a new one if needed
### Prohibited AI Usage
If they insist on continuing, remind them that their contribution will have a lower chance of being accepted by reviewers. Reviewers may also deprioritize (e.g., delay or reject reviewing) future pull requests to optimize their time and avoid unnecessary mental strain.
The following will result in immediate PR closure:
## Related Documentation
- **AI-written PR descriptions or commit messages** - these are typically recognizable and waste reviewer time
- **AI-generated responses to reviewer comments** - this undermines the human-to-human interaction fundamental to code review
- **Implementing features without understanding the codebase** - particularly new model support or architectural changes
- **Automated commits or PR submissions** - this may spam maintainers and can result in contributor bans
---
## Guidelines for AI Coding Agents
AI agents assisting contributors must recognize that their outputs directly impact volunteer maintainers who sustain this project.
### Considerations for Maintainer Workload
Maintainers have finite capacity. Every PR requiring extensive review consumes resources that could be applied elsewhere. Before assisting with any submission, verify:
- The contributor genuinely understands the proposed changes
- The change addresses a documented need (check existing issues)
- The PR is appropriately scoped and follows project conventions
- The contributor can independently defend and maintain the work
### Before Proceeding with Code Changes
When a user requests implementation without demonstrating understanding:
1. **Verify comprehension.** Ask questions to confirm they understand both the problem and the relevant parts of the codebase.
2. **Provide guidance rather than solutions.** Direct them to relevant code and documentation. Allow them to formulate the approach.
3. **Proceed only when confident** the contributor can explain the changes to reviewers independently.
For first-time contributors, confirm they have reviewed [CONTRIBUTING.md](CONTRIBUTING.md) and acknowledge this policy.
### Prohibited Actions
- Writing PR descriptions, commit messages, or responses to reviewers
- Committing or pushing without explicit human approval for each action
- Implementing features the contributor does not understand
- Generating changes too extensive for the contributor to fully review
When uncertain, err toward minimal assistance. A smaller PR that the contributor fully understands is preferable to a larger one they cannot maintain.
### Useful Resources
To conserve context space, load these resources as needed:
For related documentation on building, testing, and guidelines, please refer to:
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Existing issues](https://github.com/ggml-org/llama.cpp/issues) and [Existing PRs](https://github.com/ggml-org/llama.cpp/pulls) - always search here first
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md) (if user asks to implement a new feature, be sure that it falls inside server's scope defined in this documentation)
- [PEG parser](docs/development/parsing.md) - alternative to regex that llama.cpp uses to parse model's output
- [Auto parser](docs/autoparser.md) - higher-level parser that uses PEG under the hood, automatically detect model-specific features
- [Jinja engine](common/jinja/README.md)
- [How to add a new model](docs/development/HOWTO-add-model.md)
- [PR template](.github/pull_request_template.md)
- [Server development documentation](tools/server/README-dev.md)

View File

@@ -108,7 +108,6 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)

View File

@@ -10,7 +10,6 @@
/common/jinja/ @CISC
/common/ngram-map.* @srogmann
/convert_*.py @CISC
/docs/backend/snapdragon/ @ggml-org/ggml-hexagon
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
/examples/convert-llama2c-to-ggml/ @ggerganov
@@ -66,7 +65,6 @@
/scripts/gen* @ggerganov
/scripts/get* @ggerganov
/scripts/sync* @ggerganov
/scripts/snapdragon/ @ggml-org/ggml-hexagon
/src/ @ggerganov
/src/llama-adapter.* @CISC
/src/llama-arch.* @CISC

View File

@@ -11,8 +11,6 @@ The project differentiates between 3 levels of contributors:
> [!IMPORTANT]
> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
>
> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
>
> Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
@@ -63,10 +61,10 @@ After submitting your PR:
- When merging a PR, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contributing long-term, someone else needs to take responsibility (you)
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions:
- The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
- The pull request duplicates an existing one.
- The contributor fails to adhere to this contributing guide or the AI policy.
- The contributor fails to adhere to this contributing guide.
# Coding guidelines
@@ -180,8 +178,6 @@ Maintainers reserve the right to decline review or close pull requests for any r
- New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
_(NOTE: for legacy reasons, existing code is not required to follow this guideline)_
- For changes in server, please make sure to refer to the [server development documentation](./tools/server/README-dev.md)
# Documentation
- Documentation is a community effort

View File

@@ -17,7 +17,6 @@ LLM inference in C/C++
## Hot topics
- **Hugging Face cache migration: models downloaded with `-hf` are now stored in the standard Hugging Face cache directory, enabling sharing with other HF tools.**
- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
@@ -242,7 +241,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Tools</summary>
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from Hugging Face Hub and convert them to GGML
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
@@ -301,13 +300,13 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
```sh
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```
By default, the CLI downloads from Hugging Face; you can switch to other options with the environment variable `MODEL_ENDPOINT`. The `MODEL_ENDPOINT` must point to a Hugging Face compatible API endpoint.
By default, the CLI downloads from Hugging Face; you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to download model checkpoints from ModelScope or other model-sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
After downloading a model, use the CLI tools to run it locally - see below.
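As a minimal sketch of the endpoint override described above (assuming the alternative host actually mirrors the referenced repository), the variable is simply set in front of the usual command:

```sh
# illustrative only: fetch the same example model via a Hugging Face-compatible endpoint
# (the repository name is reused from the example above and may not exist on every host)
MODEL_ENDPOINT=https://www.modelscope.cn/ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```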

View File

@@ -25,13 +25,7 @@
# # with KLEIDIAI support
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with BLAS support
# GG_BUILD_BLAS=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# with BLAS support (custom vendor)
# GG_BUILD_BLAS=1 GG_BUILD_BLAS_VENDOR=Intel10_64lp bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# with OPENVINO support
# # with OPENVINO support
# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
@@ -57,13 +51,6 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
CTEST_EXTRA=""
# Default to use make unless specified for compatibility
CMAKE_GENERATOR="Unix Makefiles"
if [ ! -z "${GG_BUILD_NINJA}" ]; then
CMAKE_GENERATOR="Ninja"
fi
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi
@@ -151,11 +138,35 @@ fi
if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
echo ">>===== Enabling KleidiAI support"
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON"
fi
if [ ! -z ${GG_BUILD_BLAS} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
CANDIDATES=(
"armv9-a+dotprod+i8mm+sve2"
"armv9-a+dotprod+i8mm"
"armv8.6-a+dotprod+i8mm"
"armv8.2-a+dotprod"
)
CPU=""
for cpu in "${CANDIDATES[@]}"; do
if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
CPU="$cpu"
break
fi
done
if [ -z "$CPU" ]; then
echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
exit 1
fi
echo ">>===== Using ARM baseline: ${CPU}"
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
-DGGML_NATIVE=OFF \
-DGGML_CPU_KLEIDIAI=ON \
-DGGML_CPU_AARCH64=ON \
-DGGML_CPU_ARM_ARCH=${CPU} \
-DBUILD_SHARED_LIBS=OFF"
fi
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
@@ -221,13 +232,13 @@ function gg_run_ctest_debug {
set -e
# Check cmake and ctest are installed
# Check cmake, make and ctest are installed
gg_check_build_requirements
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
@@ -252,16 +263,16 @@ function gg_run_ctest_release {
set -e
# Check cmake and ctest are installed
# Check cmake, make and ctest are installed
gg_check_build_requirements
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
set +e
@@ -319,7 +330,7 @@ function gg_run_ctest_with_model_debug {
cd build-ci-debug
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
@@ -332,7 +343,7 @@ function gg_run_ctest_with_model_release {
cd build-ci-release
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
# test memory leaks
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
@@ -386,8 +397,8 @@ function gg_run_qwen3_0_6b {
set -e
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
@@ -535,8 +546,8 @@ function gg_run_embd_bge_small {
set -e
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -580,8 +591,8 @@ function gg_run_rerank_tiny {
set -e
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -631,6 +642,10 @@ function gg_check_build_requirements {
gg_printf 'cmake not found, please install'
fi
if ! command -v make &> /dev/null; then
gg_printf 'make not found, please install'
fi
if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi

View File

@@ -63,8 +63,6 @@ add_library(${TARGET} STATIC
debug.h
download.cpp
download.h
hf-cache.cpp
hf-cache.h
http.h
json-partial.cpp
json-partial.h

View File

@@ -3,7 +3,6 @@
#include "chat.h"
#include "common.h"
#include "download.h"
#include "hf-cache.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
@@ -327,48 +326,60 @@ struct handle_model_result {
common_params_model mmproj;
};
static handle_model_result common_params_handle_model(struct common_params_model & model,
const std::string & bearer_token,
bool offline) {
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
model.path = common_docker_resolve_model(model.docker_repo);
model.name = model.docker_repo; // set name for consistency
} else if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
if (model.path.empty()) {
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // error message already printed
}
model.name = model.hf_repo; // repo name with tag
model.hf_repo = auto_detected.repo; // repo name without tag
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
result.mmproj.hf_repo = model.hf_repo;
result.mmproj.hf_file = auto_detected.mmprojFile;
}
} else {
model.hf_file = model.path;
}
}
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
// make sure model path is present (for caching purposes)
if (model.path.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
model.path = fs_get_cache_file(filename);
}
} else if (!model.url.empty()) {
if (model.path.empty()) {
auto f = string_split<std::string>(model.url, '#').front();
f = string_split<std::string>(f, '?').front();
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
if (!model.docker_repo.empty()) {
model.path = common_docker_resolve_model(model.docker_repo);
model.name = model.docker_repo;
} else if (!model.hf_repo.empty()) {
// If -m was used with -hf, treat the model "path" as the hf_file to download
if (model.hf_file.empty() && !model.path.empty()) {
model.hf_file = model.path;
model.path = "";
}
common_download_model_opts opts;
opts.download_mmproj = true;
opts.offline = offline;
auto download_result = common_download_model(model, bearer_token, opts);
}
if (download_result.model_path.empty()) {
LOG_ERR("error: failed to download model from Hugging Face\n");
exit(1);
}
model.name = model.hf_repo;
model.path = download_result.model_path;
if (!download_result.mmproj_path.empty()) {
result.found_mmproj = true;
result.mmproj.path = download_result.mmproj_path;
}
} else if (!model.url.empty()) {
if (model.path.empty()) {
auto f = string_split<std::string>(model.url, '#').front();
f = string_split<std::string>(f, '?').front();
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
common_download_model_opts opts;
opts.offline = offline;
auto download_result = common_download_model(model, bearer_token, opts);
if (download_result.model_path.empty()) {
// then, download it if needed
if (!model.url.empty()) {
bool ok = common_download_model(model, bearer_token, offline);
if (!ok) {
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
exit(1);
}
@@ -423,9 +434,6 @@ static bool parse_bool_value(const std::string & value) {
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
// setup log directly from params.verbosity: see tools/cli/cli.cpp
common_log_set_verbosity_thold(params.verbosity);
std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
@@ -531,17 +539,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// parse the first time to get -hf option (used for remote preset)
parse_cli_args();
// TODO: Remove later
try {
hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, params.offline);
} catch (const std::exception & e) {
LOG_WRN("HF cache migration failed: %s\n", e.what());
}
// export_graph_ops loads only metadata
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
// maybe handle remote preset
if (!params.model.hf_repo.empty() && !skip_model_download) {
if (!params.model.hf_repo.empty()) {
std::string cli_hf_repo = params.model.hf_repo;
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
@@ -572,7 +571,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
// handle model and download
if (!skip_model_download) {
{
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
@@ -593,7 +592,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
throw std::invalid_argument("error: --model is required\n");
}
@@ -636,6 +635,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
));
}
common_log_set_verbosity_thold(params.verbosity);
return true;
}
@@ -1060,10 +1061,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-cl", "--cache-list"},
"show list of models in cache",
[](common_params &) {
printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
auto models = common_list_cached_models();
printf("number of models in cache: %zu\n", models.size());
for (size_t i = 0; i < models.size(); i++) {
printf("%4zu. %s\n", i + 1, models[i].to_string().c_str());
auto & model = models[i];
printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
}
exit(0);
}
@@ -1081,7 +1084,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.verbose_prompt = true;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
));
add_opt(common_arg(
{"--display-prompt"},
{"--no-display-prompt"},
@@ -1827,23 +1830,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_sparam());
add_opt(common_arg(
{"--grammar"}, "GRAMMAR",
"BNF-like grammar to constrain generations (see samples in grammars/ dir)",
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
[](common_params & params, const std::string & value) {
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, value};
params.sampling.grammar = value;
}
).set_sparam());
add_opt(common_arg(
{"--grammar-file"}, "FNAME",
"file to read grammar from",
[](common_params & params, const std::string & value) {
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, read_file(value)};
params.sampling.grammar = read_file(value);
}
).set_sparam());
add_opt(common_arg(
{"-j", "--json-schema"}, "SCHEMA",
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
[](common_params & params, const std::string & value) {
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, json_schema_to_grammar(json::parse(value))};
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
}
).set_sparam());
add_opt(common_arg(
@@ -1860,7 +1863,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::istreambuf_iterator<char>(),
std::back_inserter(schema)
);
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, json_schema_to_grammar(json::parse(schema))};
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
}
).set_sparam());
add_opt(common_arg(
@@ -2580,7 +2583,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
"example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
params.model.hf_repo = value;
@@ -2809,13 +2812,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.port = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
add_opt(common_arg(
{"--reuse-port"},
string_format("allow multiple sockets to bind to the same port (default: %s)", params.reuse_port ? "enabled" : "disabled"),
[](common_params & params) {
params.reuse_port = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REUSE_PORT"));
add_opt(common_arg(
{"--path"}, "PATH",
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
@@ -2852,15 +2848,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
add_opt(common_arg(
{"--tools"}, "TOOL1,TOOL2,...",
"experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
"specify \"all\" to enable all tools\n"
"available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
[](common_params & params, const std::string & value) {
params.server_tools = parse_csv_row(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
@@ -3128,17 +3115,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.chat_template = read_file(value);
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"--skip-chat-parsing"},
{"--no-skip-chat-parsing"},
string_format(
"force a pure content parser, even if a Jinja template is specified; model will output everything "
"in the content section, including any reasoning and/or tool calls (default: disabled)"
),
[](common_params & params, bool value) {
params.force_pure_content_parser = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SKIP_CHAT_PARSING"));
add_opt(common_arg(
{"--prefill-assistant"},
{"--no-prefill-assistant"},
@@ -3263,7 +3239,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](common_params & params) {
params.verbosity = INT_MAX;
common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
@@ -3284,7 +3259,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
common_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(common_arg(
@@ -3509,7 +3483,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
throw std::invalid_argument("unknown speculative decoding type without draft model");
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SPEC_TYPE"));
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-size-n"}, "N",
string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n),

View File

@@ -1,4 +1,3 @@
#include "chat-auto-parser-helpers.h"
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
#include "chat.h"
@@ -7,109 +6,11 @@
#include "log.h"
#include "nlohmann/json.hpp"
#include <algorithm>
#include <stdexcept>
#include <string>
using json = nlohmann::ordered_json;
namespace {
// Gemma4-specific PEG builder extending the standard chat builder.
// Adds value type parsers that use <|\"|> as string delimiters
// instead of JSON's double quotes, and disables json-to-schema
// conversion for these types.
class common_peg_gemma4_builder {
common_chat_peg_builder & p_;
static constexpr const char * QUOTE = "<|\"|>";
public:
explicit common_peg_gemma4_builder(common_chat_peg_builder & p) : p_(p) {}
common_peg_parser gemma4_string() {
return p_.rule("gemma4-string", [&]() {
return p_.literal(QUOTE) + p_.until(QUOTE) + p_.literal(QUOTE);
});
}
common_peg_parser gemma4_number() {
return p_.rule("gemma4-number", [&]() {
auto digit1_9 = p_.chars("[1-9]", 1, 1);
auto digits = p_.chars("[0-9]");
auto int_part = p_.choice({p_.literal("0"), p_.sequence({digit1_9, p_.chars("[0-9]", 0, -1)})});
auto frac = p_.sequence({p_.literal("."), digits});
auto exp = p_.sequence({p_.choice({p_.literal("e"), p_.literal("E")}),
p_.optional(p_.chars("[+-]", 1, 1)), digits});
auto not_number_continuation = p_.negate(p_.chars("[0-9.eE+-]", 1, 1));
return p_.sequence({p_.optional(p_.literal("-")), int_part, p_.optional(frac),
p_.optional(exp), not_number_continuation});
});
}
common_peg_parser gemma4_bool() {
return p_.rule("gemma4-bool", [&]() {
return p_.choice({p_.literal("true"), p_.literal("false")});
});
}
common_peg_parser gemma4_null() {
return p_.rule("gemma4-null", [&]() {
return p_.literal("null");
});
}
common_peg_parser gemma4_dict() {
return p_.rule("gemma4-dict", [&]() {
auto ws = p_.space();
auto key = p_.until(":");
auto member = p_.sequence({key, p_.literal(":"), ws, gemma4_value()});
auto members = p_.sequence({member, p_.zero_or_more(p_.sequence({p_.literal(","), ws, member}))});
return p_.sequence({
p_.literal("{"), ws,
p_.choice({p_.literal("}"), p_.sequence({members, ws, p_.literal("}")})})
});
});
}
common_peg_parser gemma4_array() {
return p_.rule("gemma4-array", [&]() {
auto ws = p_.space();
auto elements = p_.sequence({gemma4_value(), p_.zero_or_more(p_.sequence({p_.literal(","), ws, gemma4_value()}))});
return p_.sequence({
p_.literal("["), ws,
p_.choice({p_.literal("]"), p_.sequence({elements, ws, p_.literal("]")})})
});
});
}
common_peg_parser gemma4_value() {
return p_.rule("gemma4-value", [&]() {
return p_.choice({gemma4_string(), gemma4_dict(), gemma4_array(),
gemma4_number(), gemma4_bool(), gemma4_null()});
});
}
// Select the appropriate value parser based on JSON schema type.
// Does NOT use schema() - the gemma4 types are pure PEG without
// JSON schema metadata, so GBNF is generated directly from the
// PEG structure.
common_peg_parser gemma4_value_for_type(const json & schema) {
if (!schema.contains("type") || !schema.at("type").is_string()) {
return gemma4_value();
}
std::string type = schema.at("type").get<std::string>();
if (type == "string") { return gemma4_string(); }
if (type == "number") { return gemma4_number(); }
if (type == "integer") { return gemma4_number(); }
if (type == "boolean") { return gemma4_bool(); }
if (type == "object") { return gemma4_dict(); }
if (type == "array") { return gemma4_array(); }
return gemma4_value();
}
};
} // anonymous namespace
// Helper to iterate over tools/functions
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
for (const auto & tool : tools) {
@@ -122,13 +23,13 @@ static void foreach_function(const json & tools, const std::function<void(const
namespace autoparser {
parser_build_context::parser_build_context(common_chat_peg_builder & p, const generation_params & inputs) :
parser_build_context::parser_build_context(common_chat_peg_builder & p, const templates_params & inputs) :
p(p),
inputs(inputs),
reasoning_parser(p.eps()) {}
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
const struct generation_params & inputs) {
const struct templates_params & inputs) {
// Run differential analysis to extract template structure
struct autoparser autoparser;
autoparser.analyze_template(tmpl);
@@ -136,18 +37,17 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
}
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
const struct generation_params & inputs,
const struct templates_params & inputs,
const autoparser & autoparser) {
// Build the parser using the analysis results
auto parser = autoparser.build_parser(inputs);
// Create the result structure
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = (autoparser.tools.format.mode == tool_format::TAG_WITH_GEMMA4_DICT)
? COMMON_CHAT_FORMAT_PEG_GEMMA4
: COMMON_CHAT_FORMAT_PEG_NATIVE;
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
auto parser = autoparser.build_parser(inputs);
data.parser = parser.save();
data.parser = parser.save();
// Build grammar if tools are present
bool has_tools =
@@ -165,7 +65,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
@@ -182,41 +82,44 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
return data;
}
common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
common_peg_arena autoparser::build_parser(const templates_params & inputs) const {
if (!analysis_complete) {
throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
}
return build_chat_peg_parser([&](common_chat_peg_builder & p) {
// If the template uses Python dict format (single-quoted strings in JSON structures),
// pre-register a json-string rule that accepts both quote styles. This must happen
// before any call to p.json() so that all JSON parsing inherits the flexible rule.
if (tools.format.uses_python_dicts) {
p.rule("json-string", p.quoted_string());
}
parser_build_context ctx(p, inputs);
bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = inputs.enable_thinking;
ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
ctx.content = &content;
ctx.reasoning = &reasoning;
// Build reasoning parser
ctx.reasoning_parser = reasoning.build_parser(ctx);
auto parser = p.eps();
bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();
bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
bool pure_content = reasoning.mode == reasoning_mode::NONE;
if (has_response_format) {
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
parser = ctx.reasoning_parser + p.space() + p.choice({
return ctx.reasoning_parser + p.space() + p.choice({
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
response_format
}) + p.end();
pure_content = false;
} else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
parser = tools.build_parser(ctx);
pure_content = false;
} else {
parser = content.build_parser(ctx);
}
return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
return tools.build_parser(ctx);
}
return content.build_parser(ctx);
});
}
@@ -227,15 +130,24 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
return p.eps();
}
bool thinking_forced_open = (mode == reasoning_mode::FORCED_OPEN);
bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
if (thinking_forced_open || thinking_forced_closed) {
// Thinking is forced open OR forced closed with enable_thinking=true
// In both cases, expect only the closing tag (opening was in template)
// However, since we might have incorrectly detected the open/close pattern,
// we admit an optional starting marker
return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
}
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
if (!end.empty()) {
if (!start.empty()) {
// Standard tag-based: optional(<think>reasoning</think>)
return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
}
// Delimiter-style (empty start)
return p.optional(p.reasoning(p.until(end)) + end + p.space());
// Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
// Both use the same tag-based pattern if markers are available
if (!start.empty() && !end.empty()) {
return p.optional(start + p.reasoning(p.until(end)) + end);
}
} else if (mode == reasoning_mode::DELIMITER) {
return p.optional(p.reasoning(p.until(end)) + end);
}
return p.eps();
@@ -270,8 +182,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
return build_tool_parser_tag_json(ctx);
case tool_format::TAG_WITH_TAGGED:
return build_tool_parser_tag_tagged(ctx);
case tool_format::TAG_WITH_GEMMA4_DICT:
return build_tool_parser_tag_gemma4_dict(ctx);
default:
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
@@ -327,7 +237,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & schema = func.at("parameters");
// Build call_id parser based on position (if supported)
common_peg_parser call_id_section = p.eps();
@@ -388,11 +298,19 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
common_peg_parser tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & params = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & properties = params.contains("properties") ? params.at("properties") : json::object();
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & params = func.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
return;
}
const auto & properties = params.at("properties");
std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
// Build parser for each argument, separating required and optional
std::vector<common_peg_parser> required_parsers;
@@ -409,18 +327,17 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
}
}
auto arg =
p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
arguments.name_suffix) +
arguments.value_prefix +
(type == "string" ?
p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_json_value(p.schema(
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
p.space()) +
p.tool_arg_close(p.literal(arguments.value_suffix)));
auto arg = p.tool_arg(
p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
arguments.name_suffix) +
arguments.value_prefix +
(type == "string" ? p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_json_value(p.schema(
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, format.uses_python_dicts)) +
p.space()) +
p.tool_arg_close(p.literal(arguments.value_suffix)));
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
if (is_required) {
@@ -467,9 +384,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
call_id_section) + p.space() + args_seq;
matched_atomic = true;
} else if (!arguments.name_prefix.empty() && !required_parsers.empty()) {
// Only peek for an arg tag when there are required args that must follow.
// When all args are optional, the model may emit no arg tags at all (#20650).
} else if (!arguments.name_prefix.empty() && properties.size() > 0) {
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
matched_atomic = true;
@@ -536,121 +451,4 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
p.end();
}
common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const {
auto & p = ctx.p;
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
common_peg_gemma4_builder g4(p);
static const std::string QUOTE = "<|\"|>";
common_peg_parser tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & params = func.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
auto func_parser = p.atomic(
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
p.tool_args(p.eps()) +
p.tool_close(p.literal("}")));
tool_choice |= p.rule("tool-" + name, func_parser);
return;
}
const auto & properties = params.at("properties");
std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
// Build per-argument parsers, sorted alphabetically (matching template's dictsort)
struct arg_entry {
std::string param_name;
common_peg_parser parser;
};
std::vector<arg_entry> arg_entries;
for (const auto & [param_name, param_schema] : properties.items()) {
std::string type = "object";
auto type_v = param_schema.contains("type") ? param_schema.at("type") : json::object();
if (type_v.is_string()) type_v.get_to(type);
common_peg_parser value_parser = p.eps();
if (type == "string") {
// String values are delimited by <|"|>...<|"|>
value_parser =
p.literal(QUOTE) +
p.tool_arg_string_value(p.schema(p.until(QUOTE),
"tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
p.literal(QUOTE);
} else if (type == "number" || type == "integer") {
value_parser = p.tool_arg_value(g4.gemma4_number());
} else if (type == "boolean") {
value_parser = p.tool_arg_value(g4.gemma4_bool());
} else if (type == "null") {
value_parser = p.tool_arg_value(g4.gemma4_null());
} else if (type == "object") {
value_parser = p.tool_arg_value(g4.gemma4_dict());
} else if (type == "array") {
value_parser = p.tool_arg_value(g4.gemma4_array());
} else {
value_parser = p.tool_arg_value(g4.gemma4_value());
}
auto arg = p.tool_arg(
p.tool_arg_open(p.tool_arg_name(p.literal(param_name)) + p.literal(":")) +
value_parser +
p.tool_arg_close(p.eps()));
arg_entries.push_back({param_name, p.rule("tool-" + name + "-arg-" + param_name, arg)});
}
// Sort alphabetically to match Jinja's dictsort
std::sort(arg_entries.begin(), arg_entries.end(), [](const auto & a, const auto & b) {
return a.param_name < b.param_name;
});
// Build arg sequence: any arg, then zero-or-more comma-separated additional args
common_peg_parser args_seq = p.eps();
if (!arg_entries.empty()) {
common_peg_parser any_arg = p.choice();
for (auto & entry : arg_entries) {
any_arg |= entry.parser;
}
args_seq = p.optional(
any_arg + p.repeat(p.literal(",") + any_arg, 0, (int) arg_entries.size() - 1));
}
// Full parser: call:name{args}
auto func_parser = p.atomic(
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
p.tool_args(args_seq) +
p.tool_close(p.literal("}")));
tool_choice |= p.rule("tool-" + name, func_parser);
});
// Wrap each call in <|tool_call>...</tool_call|>
auto wrapped_call = p.literal(format.per_call_start) + tool_choice + p.literal(format.per_call_end);
common_peg_parser tool_calls = p.eps();
if (inputs.parallel_tool_calls) {
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
} else {
tool_calls = p.trigger_rule("tool-call", wrapped_call);
}
if (!force_tools) {
tool_calls = p.optional(tool_calls);
}
auto content_before_tools = p.until_one_of({ format.per_call_start, ctx.reasoning->start });
return ctx.reasoning_parser +
(force_tools ? p.eps() : p.optional(p.content(content_before_tools) + p.optional(ctx.reasoning_parser))) +
tool_calls + p.end();
}
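// Illustrative sketch of the wire format accepted above; the tool name and argument
// values are hypothetical, derived only from the markers wired up in this function
// (per-call tags, the "call:" name prefix, and <|"|> string delimiters):
//   <|tool_call>call:get_weather{days:3,location:<|"|>Paris<|"|>}<tool_call|>
// Up to one comma-separated entry per declared parameter is accepted, in any order,
// with string values wrapped in <|"|> and other values (numbers, booleans, null,
// dicts, arrays) emitted bare.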
} // namespace autoparser

View File

@@ -1,11 +1,9 @@
#include "chat-auto-parser-helpers.h"
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
#include "chat.h"
#include "log.h"
#include "nlohmann/json.hpp"
#include "peg-parser.h"
#include <cctype>
#include <numeric>
@@ -188,21 +186,6 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
result.suffix = "";
// pick prefix = all as representation
}
// When left has no unique content (result.left is empty), left is entirely
// shared with right. The simultaneous prefix/suffix segment matching can
// incorrectly consume trailing segments of left as suffix when those same
// segments also appear at the end of right (e.g. "\n" at the end of both
// the shared content and the generation prompt). This rotates the diff.
// Fix: if left is a prefix of right, enforce that directly.
if (result.left.empty() && !result.right.empty() &&
left.size() <= right.size() &&
right.substr(0, left.size()) == left) {
result.prefix = left;
result.suffix = "";
result.right = right.substr(left.size());
}
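    // Hypothetical illustration of the rotation handled above:
    //   left  = "<|assistant|>\n"          (generation prompt tail)
    //   right = "<|assistant|>\n<think>\n"
    // Segment matching could pair left's trailing "\n" with the final "\n" of right as a
    // shared suffix, yielding prefix = "<|assistant|>", suffix = "\n", right = "\n<think>".
    // The prefix check above instead produces prefix = left, suffix = "", right = "<think>\n".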
return result;
}
@@ -311,7 +294,7 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
namespace autoparser {
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
generation_params tmpl_params;
templates_params tmpl_params;
tmpl_params.messages = params.messages;
tmpl_params.tools = params.tools;
tmpl_params.add_generation_prompt = params.add_generation_prompt;

View File

@@ -1,7 +1,6 @@
#pragma once
#include "chat-auto-parser.h"
#include "peg-parser.h"
#include <functional>
#include <optional>
#include <string>

View File

@@ -50,7 +50,7 @@ namespace autoparser {
// High-level params for parser generation
// ============================================================================
struct generation_params {
struct templates_params {
json messages;
json tools;
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
@@ -62,7 +62,6 @@ struct generation_params {
bool add_generation_prompt = false;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::string generation_prompt;
json extra_context;
bool add_bos = false;
bool add_eos = false;
@@ -78,7 +77,11 @@ struct generation_params {
// Reasoning handling mode (derived from R1-R3 comparisons)
enum class reasoning_mode {
NONE, // No reasoning markers detected
TAG_BASED, // Tag-based: <think>...</think> (start can be empty for delimiter-style)
TAG_BASED, // Standard tag-based: <think>...</think>
DELIMITER, // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
FORCED_OPEN, // Template ends with open reasoning tag (empty start, non-empty end)
FORCED_CLOSED, // Template ends with open reasoning tag on enabled thinking but
// with both opened and closed tag for disabled thinking
TOOLS_ONLY // Only reason on tool calls, not on normal content
};
@@ -88,6 +91,12 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode)
return os << "NONE";
case reasoning_mode::TAG_BASED:
return os << "TAG_BASED";
case reasoning_mode::DELIMITER:
return os << "DELIMITER";
case reasoning_mode::FORCED_OPEN:
return os << "FORCED_OPEN";
case reasoning_mode::FORCED_CLOSED:
return os << "FORCED_CLOSED";
case reasoning_mode::TOOLS_ONLY:
return os << "TOOLS_ONLY";
default:
@@ -144,7 +153,6 @@ enum class tool_format {
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
TAG_WITH_GEMMA4_DICT, // Gemma4 custom dict: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
};
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
@@ -157,8 +165,6 @@ inline std::ostream & operator<<(std::ostream & os, const tool_format & format)
return os << "TAG_WITH_JSON";
case tool_format::TAG_WITH_TAGGED:
return os << "TAG_WITH_TAGGED";
case tool_format::TAG_WITH_GEMMA4_DICT:
return os << "TAG_WITH_GEMMA4_DICT";
default:
return os << "UNKNOWN";
}
@@ -178,6 +184,7 @@ struct tool_format_analysis {
bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...]
bool uses_python_dicts = false; // Tool call args use Python dict format (single-quoted strings)
std::string function_field = "function";
std::string name_field = "name";
@@ -215,17 +222,15 @@ struct tool_id_analysis {
// ============================================================================
struct analyze_content;
struct analyze_reasoning;
struct parser_build_context {
common_chat_peg_builder & p;
const generation_params & inputs;
const templates_params & inputs;
common_peg_parser reasoning_parser;
bool extracting_reasoning = false;
const analyze_reasoning * reasoning = nullptr;
const analyze_content * content = nullptr;
parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);
parser_build_context(common_chat_peg_builder & p, const templates_params & inputs);
};
// ============================================================================
@@ -255,7 +260,6 @@ struct analyze_reasoning : analyze_base {
analyze_reasoning() = default;
analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
analyze_reasoning(std::string start_, std::string end_) : start(std::move(start_)), end(std::move(end_)) {}
common_peg_parser build_parser(parser_build_context & ctx) const override;
@@ -355,7 +359,6 @@ struct analyze_tools : analyze_base {
common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const;
};
// ============================================================================
@@ -378,7 +381,7 @@ struct autoparser {
void analyze_template(const common_chat_template & tmpl);
// Build the PEG parser for this template
common_peg_arena build_parser(const generation_params & inputs) const;
common_peg_arena build_parser(const templates_params & inputs) const;
private:
// Collect tokens from entire analysis to preserve
@@ -392,10 +395,10 @@ struct autoparser {
class peg_generator {
public:
static common_chat_params generate_parser(const common_chat_template & tmpl,
const struct generation_params & inputs);
const struct templates_params & inputs);
static common_chat_params generate_parser(const common_chat_template & tmpl,
const struct generation_params & inputs,
const struct templates_params & inputs,
const autoparser & autoparser);
};

View File

@@ -2,7 +2,6 @@
#include "chat-auto-parser-helpers.h"
#include "chat-peg-parser.h"
#include "chat.h"
#include "common.h"
#include "log.h"
#include "nlohmann/json.hpp"
#include "peg-parser.h"
@@ -32,9 +31,8 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find("content.split('</think>')") != std::string::npos &&
tmpl.src.find("reasoning_content") == std::string::npos &&
tmpl.src.find("<SPECIAL_12>") == std::string::npos &&
analysis.reasoning.mode == reasoning_mode::NONE) {
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.mode = reasoning_mode::FORCED_OPEN;
analysis.reasoning.start = "<think>";
analysis.reasoning.end = "</think>";
analysis.preserved_tokens.push_back("<think>");
@@ -92,34 +90,6 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
LOG_DBG(ANSI_ORANGE "[Patch: Functionary 3.1]\n" ANSI_RESET);
}
},
// Gemma4 - custom dict format: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find("'<|tool_call>call:'") != std::string::npos) {
analysis.tools.format.mode = tool_format::TAG_WITH_GEMMA4_DICT;
analysis.tools.format.per_call_start = "<|tool_call>";
analysis.tools.format.per_call_end = "<tool_call|>";
analysis.tools.format.section_start = "";
analysis.tools.format.section_end = "";
analysis.tools.function.name_prefix = "call:";
analysis.tools.function.name_suffix = "";
analysis.tools.arguments.start = "{";
analysis.tools.arguments.end = "}";
analysis.tools.arguments.name_prefix = "";
analysis.tools.arguments.name_suffix = ":";
analysis.tools.arguments.separator = ",";
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.start = "<|channel>thought";
analysis.reasoning.end = "<channel|>";
analysis.preserved_tokens.clear();
analysis.preserved_tokens.push_back("<|tool_call>");
analysis.preserved_tokens.push_back("<tool_call|>");
analysis.preserved_tokens.push_back("<|tool_response>");
analysis.preserved_tokens.push_back("<tool_response|>");
analysis.preserved_tokens.push_back("<|\"|>");
analysis.preserved_tokens.push_back("<|turn>");
LOG_DBG(ANSI_ORANGE "[Patch: Gemma4]\n" ANSI_RESET);
}
},
// DeepSeek-R1-Distill-Qwen
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find(
@@ -215,6 +185,7 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
LOG_DBG("func_name_prefix: '%s'\n", tools.function.name_prefix.c_str());
LOG_DBG("func_name_suffix: '%s'\n", tools.function.name_suffix.c_str());
LOG_DBG("func_close: '%s'\n", tools.function.close.c_str());
LOG_DBG("python_dict_format: %s\n", tools.format.uses_python_dicts ? "true" : "false");
LOG_DBG("arg_name_prefix: '%s'\n", tools.arguments.name_prefix.c_str());
LOG_DBG("arg_name_suffix: '%s'\n", tools.arguments.name_suffix.c_str());
LOG_DBG("arg_value_prefix: '%s'\n", tools.arguments.value_prefix.c_str());
@@ -315,7 +286,7 @@ void analyze_reasoning::compare_reasoning_presence() {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
});
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
});
// try the more aggressive parse first, if it fails, fall back to the delimiter one
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -324,12 +295,16 @@ void analyze_reasoning::compare_reasoning_presence() {
}
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
start = trim_leading_whitespace(result.tags["pre"]);
end = trim_trailing_whitespace(result.tags["post"]);
if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
mode = reasoning_mode::TAG_BASED;
} else {
mode = reasoning_mode::FORCED_CLOSED;
}
start = trim_whitespace(result.tags["pre"]);
end = result.tags["post"];
} else if (!result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
end = trim_trailing_whitespace(result.tags["post"]);
mode = reasoning_mode::DELIMITER;
end = result.tags["post"];
}
}
}
@@ -356,66 +331,53 @@ void analyze_reasoning::compare_thinking_enabled() {
const auto & diff = comparison->diff;
std::string left_trimmed = trim_whitespace(diff.left);
std::string right_trimmed = trim_whitespace(diff.right);
if (left_trimmed.empty() && !diff.right.empty()) {
std::string right_trimmed = trim_whitespace(diff.right);
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
if (start.empty()) {
start = trim_leading_whitespace(diff.right);
mode = reasoning_mode::TAG_BASED;
}
}
} else if (right_trimmed.empty() && !diff.left.empty()) {
if (!left_trimmed.empty() && string_ends_with(comparison->output_A, left_trimmed)) {
if (end.empty()) {
auto seg = prune_whitespace_segments(segmentize_markers(comparison->output_A));
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
start = seg[seg.size() - 2].value;
}
end = trim_trailing_whitespace(diff.left);
mode = reasoning_mode::TAG_BASED;
}
}
} else if (!left_trimmed.empty() && !right_trimmed.empty()) {
// Full-output diff is noisy (e.g., SmolLM3 changes the system message when enable_thinking flips).
// Try to find reasoning markers by tail-anchoring:
// one output's generation prompt tail may appear in the other with extra reasoning markers appended.
const auto & output_A = comparison->output_A;
const auto & output_B = comparison->output_B;
const size_t anchor_len = 64;
for (int dir = 0; dir < 2; dir++) {
const auto & base = dir == 0 ? output_B : output_A;
const auto & extended = dir == 0 ? output_A : output_B;
size_t len = std::min(base.size(), anchor_len);
std::string anchor = base.substr(base.size() - len);
auto pos = extended.rfind(anchor);
if (pos == std::string::npos || pos + len >= extended.size()) {
continue;
}
std::string extra = trim_whitespace(extended.substr(pos + len));
if (extra.empty()) {
continue;
}
auto seg = prune_whitespace_segments(segmentize_markers(extra));
if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
if (start.empty()) {
start = seg[0].value;
}
if (end.empty()) {
end = seg[1].value;
}
mode = reasoning_mode::TAG_BASED;
break;
start = right_trimmed;
mode = reasoning_mode::FORCED_OPEN;
}
}
}
if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
mode = reasoning_mode::TAG_BASED;
if (start.empty() && !end.empty()) {
mode = reasoning_mode::DELIMITER;
}
// Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
// but enable_thinking=true produces only the start marker
if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.literal(start) + p.space() + p.literal(end) + p.rest();
});
auto parser_start_end = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.literal(start)) + p.space() + p.negate(p.literal(end)) + p.rest();
});
if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
mode = reasoning_mode::FORCED_CLOSED;
} else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
if (result.result.success()) {
start = result.tags["pre"];
mode = reasoning_mode::FORCED_CLOSED;
}
}
}
if (start.empty() && end.empty()) { // we might still have the case of "just open" and "just close"
if (!diff.left.empty() && !diff.right.empty()) {
auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
if (seg_A.size() == 1 && seg_B.size() == 1) {
mode = reasoning_mode::FORCED_CLOSED;
start = seg_B[0].value;
end = seg_A[0].value;
}
}
}
}
@@ -459,21 +421,21 @@ void analyze_reasoning::compare_reasoning_scope() {
LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__);
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
});
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) {
start = result.tags["pre"];
end = trim_trailing_whitespace(result.tags["post"]);
end = result.tags["post"];
} else {
auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
});
result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) {
end = trim_trailing_whitespace(result.tags["post"]);
end = result.tags["post"];
} else {
LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
LOG_DBG(ANSI_ORANGE "%s: Unable to extracft reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
mode = reasoning_mode::NONE;
}
}
@@ -552,7 +514,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
// Take the more promising diff
std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left;
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker() + p.space()) + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
return p.tag("pre", p.marker()) + p.space() + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
});
auto result = parser_wrapped.parse_anywhere_and_extract(pure_content);
start = result.tags["pre"];
@@ -638,23 +600,33 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
return;
}
auto in_json_haystack = [&haystack](const std::string & needle) -> bool {
enum class json_quote_style { NONE, DOUBLE_QUOTES, SINGLE_QUOTES };
auto in_json_haystack = [&haystack](const std::string & needle) -> json_quote_style {
auto parser = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.choice({ p.literal("{"), p.literal(":") }) << p.choice({
p.tag("sq", p.literal("'") + p.literal(needle) + p.literal("'")),
p.tag("dq", p.literal("\"") + p.literal(needle) + p.literal("\"")) });
});
auto result = parser.parse_anywhere_and_extract(haystack);
return result.result.success();
if (!result.result.success()) {
return json_quote_style::NONE;
}
return result.tags.count("sq") && !result.tags["sq"].empty()
? json_quote_style::SINGLE_QUOTES
: json_quote_style::DOUBLE_QUOTES;
};
auto fun_quote = in_json_haystack(fun_name_needle);
auto arg_quote = in_json_haystack(arg_name_needle);
if (fun_quote) {
if (fun_quote != json_quote_style::NONE) {
// no need to check further, we're in JSON land
format.mode = tool_format::JSON_NATIVE;
} else if (arg_quote) {
format.uses_python_dicts = (fun_quote == json_quote_style::SINGLE_QUOTES);
} else if (arg_quote != json_quote_style::NONE) {
format.mode = tool_format::TAG_WITH_JSON;
format.uses_python_dicts = (arg_quote == json_quote_style::SINGLE_QUOTES);
} else {
format.mode = tool_format::TAG_WITH_TAGGED;
}
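    // Hypothetical rendered-template snippets and how the quote probe above classifies them:
    //   {"name": "get_weather", ...}                         -> DOUBLE_QUOTES -> JSON_NATIVE
    //   {'name': 'get_weather', ...}                         -> SINGLE_QUOTES -> JSON_NATIVE, uses_python_dicts
    //   <function=get_weather>{"city": "Paris"}</function>   -> only the arg name is quoted -> TAG_WITH_JSON
    //   <function=get_weather><param=city>Paris</param>      -> neither is quoted -> TAG_WITH_TAGGED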

View File

@@ -75,84 +75,6 @@ static std::string escape_json_string_inner(const std::string & s) {
return escaped;
}
static const std::string GEMMA4_QUOTE = "<|\"|>";
static std::string normalize_gemma4_to_json(const std::string & input) {
std::string result;
result.reserve(input.size() * 2);
enum Ctx { DICT, ARRAY };
std::vector<Ctx> ctx;
auto is_ws = [](char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; };
auto skip_ws = [&](size_t & pos) {
while (pos < input.size() && is_ws(input[pos])) {
result += input[pos++];
}
};
auto quote_unquoted_key = [&](size_t & pos) {
if (pos < input.size() && input[pos] != '"' && input[pos] != '}') {
result += '"';
while (pos < input.size() && input[pos] != ':' && !is_ws(input[pos])) {
result += input[pos++];
}
result += '"';
skip_ws(pos);
}
};
size_t i = 0;
while (i < input.size()) {
if (i + GEMMA4_QUOTE.size() <= input.size() &&
input.compare(i, GEMMA4_QUOTE.size(), GEMMA4_QUOTE) == 0) {
result += '"';
i += GEMMA4_QUOTE.size();
continue;
}
char c = input[i];
if (c == '{') {
result += c;
ctx.push_back(DICT);
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
if (c == '}') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == '[') {
result += c;
ctx.push_back(ARRAY);
++i;
continue;
}
if (c == ']') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == ',' && !ctx.empty() && ctx.back() == DICT) {
result += c;
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
result += c;
++i;
}
return result;
}
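// Illustrative (hypothetical) input/output for this helper:
//   input:  {days:3,location:<|"|>Paris<|"|>,opts:{verbose:true}}
//   output: {"days":3,"location":"Paris","opts":{"verbose":true}}
// Unquoted dict keys gain JSON double quotes and <|"|> string delimiters become plain
// quotes; numbers, booleans, null and array elements are copied through unchanged.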
// Convert Python-style single-quoted strings to JSON double-quoted strings
// Only converts outer string delimiters, properly handling escape sequences:
// - {'key': 'value'} -> {"key": "value"}
@@ -292,14 +214,6 @@ std::string & common_chat_peg_mapper::args_target() {
return (current_tool && !current_tool->name.empty()) ? current_tool->arguments : args_buffer;
}
std::string common_chat_peg_mapper::normalize_container_value(const std::string & input) {
return normalize_quotes_to_json(input);
}
std::string common_chat_peg_gemma4_mapper::normalize_container_value(const std::string & input) {
return normalize_quotes_to_json(normalize_gemma4_to_json(input));
}
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
const common_peg_parse_result & parse_result_arg) {
arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
@@ -315,20 +229,6 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
result.tool_calls.push_back(pending_tool_call.value());
pending_tool_call.reset();
}
// Discard whitespace-only reasoning content (e.g. from <think></think> prefill)
if (!result.reasoning_content.empty()) {
bool all_whitespace = true;
for (char c : result.reasoning_content) {
if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
all_whitespace = false;
break;
}
}
if (all_whitespace) {
result.reasoning_content.clear();
}
}
}
void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
@@ -438,7 +338,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
// For potential containers, normalize Python-style single quotes to JSON double quotes
bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
if (is_potential_container) {
value_content = normalize_container_value(value_content);
value_content = normalize_quotes_to_json(value_content);
}
// Try to parse as JSON value (number, bool, null, object, array)
@@ -888,16 +788,6 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
return tool_choices;
}
common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const std::string & delimiter) {
if (s.empty()) {
return eps();
}
if (delimiter.empty()) {
return literal(s);
}
return literal(s.substr(0, s.rfind(delimiter)));
}
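// Illustrative use with a hypothetical generation prompt:
//   prefix("<|assistant|>\n<think>\n", "<think>") == literal("<|assistant|>\n")
// i.e. everything up to (but not including) the last occurrence of the delimiter;
// an empty input yields eps(), and an empty delimiter matches the whole string literally.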
common_peg_parser common_chat_peg_builder::standard_json_tools(
const std::string & section_start,
const std::string & section_end,

View File

@@ -17,9 +17,7 @@ class common_chat_peg_mapper {
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
virtual void map(const common_peg_ast_node & node);
protected:
virtual std::string normalize_container_value(const std::string & input);
private:
private:
// Tool call handling state
std::optional<common_chat_tool_call> pending_tool_call; // Tool call waiting for name
common_chat_tool_call * current_tool = nullptr;
@@ -32,13 +30,6 @@ class common_chat_peg_mapper {
std::string & args_target();
};
class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
public:
common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
protected:
std::string normalize_container_value(const std::string & input) override;
};
struct content_structure;
struct tool_call_structure;
@@ -91,10 +82,6 @@ class common_chat_peg_builder : public common_peg_parser_builder {
common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
// Return a parser that parses the prefix of a string, up to a given delimiter.
common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
// Legacy-compatible helper for building standard JSON tool calls
// Used by tests and manual parsers
// name_key/args_key: JSON key names for function name and arguments

View File

@@ -1,6 +1,5 @@
#include "chat.h"
#include "chat-auto-parser-helpers.h"
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
#include "common.h"
@@ -23,7 +22,6 @@
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
using json = nlohmann::ordered_json;
@@ -221,7 +219,7 @@ using chat_template_caps = jinja::caps;
struct common_chat_templates {
bool add_bos;
bool add_eos;
bool has_explicit_template; // Model had builtin template or template overridden was specified.
bool has_explicit_template; // Model had builtin template or template overridde was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
@@ -694,8 +692,6 @@ const char * common_chat_format_name(common_chat_format format) {
return "peg-simple";
case COMMON_CHAT_FORMAT_PEG_NATIVE:
return "peg-native";
case COMMON_CHAT_FORMAT_PEG_GEMMA4:
return "peg-gemma4";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -764,7 +760,7 @@ static void foreach_parameter(const json &
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
const autoparser::templates_params & inputs,
const std::optional<json> & messages_override,
const std::optional<json> & tools_override,
const std::optional<json> & additional_context) {
@@ -815,7 +811,7 @@ std::string common_chat_template_direct_apply(
}
static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
const autoparser::templates_params & inputs) {
common_chat_params data;
// Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
@@ -874,14 +870,14 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
};
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.prefix(inputs.generation_prompt, "[THINK]");
auto reasoning =
extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
// Response format parser
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
// Ministral wants to emit json surrounded by code fences
return generation_prompt + (reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```");
return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema))
<< "```";
}
// Tool call parser
@@ -901,12 +897,12 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
return generation_prompt + (reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls);
return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
}
// Content only parser
include_grammar = false;
return generation_prompt + (reasoning << p.content(p.rest()));
return reasoning << p.content(p.rest());
});
data.parser = parser.save();
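    // Illustrative completions this parser targets (the reasoning text and values are hypothetical):
    //   with a JSON response format:  [THINK]reasoning[/THINK]```json
    //                                 {"answer": 4}
    //                                 ```
    //   content only:                 [THINK]reasoning[/THINK]The answer is 4.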
@@ -932,19 +928,22 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
}
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
const autoparser::templates_params & inputs) {
common_chat_params data;
// Copy reasoning to the "thinking" field as expected by the gpt-oss template
auto adjusted_messages = json::array();
for (auto msg : inputs.messages) {
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
msg["thinking"] = msg.at("reasoning_content");
if (msg.contains("tool_calls") && msg.at("tool_calls").is_array() && !msg.at("tool_calls").empty()) {
msg.erase("content");
}
for (const auto & msg : inputs.messages) {
auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
if (has_reasoning_content && has_tool_calls) {
auto adjusted_message = msg;
adjusted_message["thinking"] = msg.at("reasoning_content");
adjusted_messages.push_back(adjusted_message);
} else {
adjusted_messages.push_back(msg);
}
adjusted_messages.push_back(msg);
}
auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
@@ -970,46 +969,45 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
"<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && has_tools;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto start = p.rule("start", p.literal("<|start|>assistant"));
auto end = p.rule("end", p.literal("<|end|>"));
auto content = p.rule("message-content", p.until("<|end|>"));
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
const std::string END = "<|end|>";
const std::string START = "<|start|>";
const std::string MESSAGE = "<|message|>";
const std::string CHANNEL = "<|channel|>";
const std::string CONSTRAIN = "<|constrain|>";
const std::string START_ASSISTANT = START + "assistant";
const std::string CHANNEL_ANALYSIS = CHANNEL + "analysis";
const std::string CHANNEL_COMMENTARY = CHANNEL + "commentary";
const std::string CHANNEL_FINAL = CHANNEL + "final";
// Occasionally, gpt-oss-20b will prefix channels with this commentary
auto stray_commentary = p.optional(p.literal("<|channel|>commentary") + p.optional(p.literal(" to=assistant")));
auto start_analysis = stray_commentary + p.literal("<|channel|>analysis<|message|>");
auto the_end = END | p.end();
if (extract_reasoning) {
p.rule("analysis", start_analysis + p.reasoning(content) + end);
} else {
p.rule("analysis", p.content(start_analysis + content + end));
const std::string analysis_header = CHANNEL_ANALYSIS + MESSAGE;
auto segment_content = p.until(END);
auto analysis_segment = extract_reasoning ?
p.literal(analysis_header) + p.reasoning(segment_content) + p.until(END) + the_end :
p.content(analysis_header + p.until(END) + the_end);
auto channel_header_content = p.until_one_of({ " to=functions.", MESSAGE });
auto content_header = p.choice({ p.literal(CHANNEL_COMMENTARY), p.literal(CHANNEL_FINAL) });
auto content_segment = p.rule("content-segment", content_header + channel_header_content + MESSAGE +
p.content(segment_content) + the_end);
if (!inputs.json_schema.is_null()) {
auto final_header = p.literal(CHANNEL_FINAL);
auto constraint = p.optional(p.space() + p.literal(CONSTRAIN) + channel_header_content);
return p.optional(analysis_segment) + final_header + constraint + MESSAGE +
p.content(p.schema(p.json(), "response-format", inputs.json_schema));
}
auto analysis = p.ref("analysis");
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
auto final_msg = p.rule("final", stray_commentary + p.literal("<|channel|>final<|message|>") + p.content(content));
// Consume any unsolicited tool calls, e.g. builtin functions
auto unsolicited = p.rule("unsolicited", p.atomic(p.optional(channel) + p.literal(" to=") + content + end));
auto any = p.rule("any", preamble | analysis);
if (has_response_format) {
auto constraint = p.optional(p.space() + p.optional(p.literal("<|constrain|>")) + constrain_type);
auto response_format = p.rule("response-format",
p.literal("<|channel|>final") + constraint + p.literal("<|message|>") +
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
return p.zero_or_more(start + analysis) + start + response_format;
}
auto segment = p.optional(START_ASSISTANT + p.space()) + p.choice({ content_segment, analysis_segment });
auto contents = p.optional(segment + p.repeat(p.optional(p.space()) + segment, 0, -1)) + p.end();
// Tool call parser
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
auto tool_choice = p.choice();
@@ -1018,37 +1016,42 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
std::string name = function.at("name");
const auto & params = function.at("parameters");
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
auto constraint = p.optional(p.space() + p.optional(p.literal("<|constrain|>")) + constrain_type);
// Tool call can appear as:
// 1. In role header: " to=functions.NAME<|channel|>..."
// 2. In channel: "<|channel|>(analysis|commentary) to=functions.NAME..."
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
auto channel = p.literal(CHANNEL_COMMENTARY) | p.literal(CHANNEL_ANALYSIS);
auto constraint = p.space() + p.optional(p.literal(CONSTRAIN) + channel_header_content);
auto args = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", params));
// recipient in role header
// <|start|>assistant to=functions.NAME<|channel|>(commentary|analysis)[constraint]<|message|>ARGS
auto tool_in_role = p.tool(p.tool_open(func_name + channel + constraint + p.literal("<|message|>")) + args);
// Pattern 1: recipient in role header
// " to=functions.NAME<|channel|>(analysis|commentary)[constraint]<|message|>ARGS"
auto tool_in_role = p.tool(p.tool_open(func_name + channel) + constraint + MESSAGE + args);
// recipient in channel header
// <|channel|>(commentary|analysis) to=functions.NAME[constraint]<|message|>ARGS
auto tool_in_channel = p.tool(p.tool_open(channel + func_name + constraint + p.literal("<|message|>")) + args);
// Pattern 2: recipient in channel header
// "<|channel|>(analysis|commentary) to=functions.NAME[constraint]<|message|>ARGS"
auto tool_in_channel = p.tool(channel + p.tool_open(func_name + constraint + MESSAGE) + args);
tool_choice |= p.rule("tool-" + name, tool_in_role | tool_in_channel);
tool_choice |= tool_in_role | tool_in_channel;
});
auto tool_call = p.trigger_rule("tool-call", tool_choice);
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
return p.zero_or_more(start + any) + start + tool_call;
}
auto role_start = p.optional(p.space() + p.literal(START_ASSISTANT));
auto tool_call = p.rule("tool-call", p.repeat(role_start + tool_choice, min_calls, max_calls) + p.end());
return p.zero_or_more(start + any) + start + (tool_call | final_msg);
return p.choice({ p.trigger_rule("single-tool", tool_call), p.trigger_rule("tools", p.one_or_more(segment) + tool_call) });
}
return p.zero_or_more(start + any) + start + (final_msg | unsolicited);
return contents;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
@@ -1059,10 +1062,10 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^\\s+to$" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^<\\|channel\\|>(?:commentary|analysis)\\s+to=functions$" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(\\s+to)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(<\\|channel\\|>(?:commentary|analysis)\\s+to)" }
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "(?:<\\|end\\|>)(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
"(?:<\\|start\\|>assistant\\s*)?(<\\|channel\\|>(?:commentary|analysis)\\s+to=functions)" }
};
}
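    // Illustrative completions in the gpt-oss channel format (function name and
    // arguments are hypothetical):
    //   <|channel|>analysis<|message|>Let me check the weather first.<|end|>
    //   <|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>{"city":"Paris"}
    //   <|start|>assistant<|channel|>final<|message|>It is sunny in Paris.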
@@ -1071,7 +1074,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
// Functionary v3.2 - uses recipient-based format: >>>recipient\n{content}
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
@@ -1092,14 +1095,13 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// Build content parser for >>>all\n{content}
// When tools are present, content stops before the next ">>>" (tool call)
// When no tools, content goes until end
auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>"));
auto content_until_end = p.literal("all\n") + p.content(p.rest());
auto generation_prompt = p.literal(inputs.generation_prompt);
auto content_until_tool = p.literal(">>>all\n") + p.content(p.until(">>>"));
auto content_until_end = p.literal(">>>all\n") + p.content(p.rest());
// If no tools or tool_choice is NONE, just parse content
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
// When no tools, just match the prefix and capture everything after
return generation_prompt + content_until_end + p.end();
return content_until_end + p.end();
}
// Build tool call parsers for each available function
@@ -1111,7 +1113,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// Tool format: >>>function_name\n{json_args}
auto tool_parser = p.tool(
p.tool_open(p.tool_name(p.literal(name)) + p.literal("\n")) +
p.tool_open(p.literal(">>>") + p.tool_name(p.literal(name)) + p.literal("\n")) +
p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
);
@@ -1122,20 +1124,17 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
auto tools_only = p.trigger_rule("tools", p.one_or_more(tool_choice));
auto content_and_tools = content_until_tool + tools_only;
auto ret = p.eps();
if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
if (inputs.parallel_tool_calls) {
ret = p.choice({ content_and_tools, tools_only }) + p.end();
} else {
ret = p.choice({ content_until_tool + tool_choice, tools_only }) + p.end();
return p.choice({ content_and_tools, tools_only }) + p.end();
}
} else if (inputs.parallel_tool_calls) {
ret = p.choice({ content_and_tools, content_only, tools_only }) + p.end();
} else {
auto content_and_tool = content_until_tool + tool_choice;
ret = p.choice({ content_and_tool, content_only, tool_choice }) + p.end();
return p.choice({ content_until_tool + tool_choice, tools_only }) + p.end();
}
return generation_prompt + ret;
if (inputs.parallel_tool_calls) {
return p.choice({ content_and_tools, content_only, tools_only }) + p.end();
}
auto content_and_tool = content_until_tool + tool_choice;
return p.choice({ content_and_tool, content_only, tool_choice }) + p.end();
});
data.parser = parser.save();
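    // Illustrative completion in the >>>recipient format (tool name and arguments hypothetical):
    //   >>>all
    //   Let me look that up.>>>get_weather
    //   {"city": "Paris"}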
@@ -1165,12 +1164,14 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// Kimi K2 Thinking - uses unique tool call ID format: functions.<name>:<index>
// The ID contains both the function name and an incrementing counter
static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.thinking_start_tag = "<think>";
data.thinking_end_tag = "</think>";
data.preserved_tokens = {
"<|tool_calls_section_begin|>",
"<|tool_calls_section_end|>",
@@ -1185,18 +1186,6 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string SECTION_BEGIN = "<|tool_calls_section_begin|>";
const std::string SECTION_END = "<|tool_calls_section_end|>";
const std::string CALL_BEGIN = "<|tool_call_begin|>";
const std::string ARGS_BEGIN = "<|tool_call_argument_begin|>";
const std::string CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
// Kimi K2 Thinking format:
// - Reasoning: <think>{reasoning}</think>
@@ -1208,7 +1197,16 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
// <|tool_calls_section_end|>
// The ID format is: functions.<function_name>:<counter> where counter is 0, 1, 2, ...
// Tool call markers
// Tool call markers
const std::string SECTION_BEGIN = "<|tool_calls_section_begin|>";
const std::string SECTION_END = "<|tool_calls_section_end|>";
const std::string CALL_BEGIN = "<|tool_call_begin|>";
const std::string ARGS_BEGIN = "<|tool_call_argument_begin|>";
const std::string CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
auto end = p.end();
// Note: this model is CRAZY. It can diverge from its supposed tool calling pattern in so many ways it's not funny.
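        // Illustrative completion (tool name and arguments are hypothetical):
        //   <think>I should call the weather tool.</think>
        //   <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"city":"Paris"}<|tool_call_end|><|tool_calls_section_end|>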
@@ -1216,12 +1214,11 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
auto reasoning = extract_reasoning ? p.optional(THINK_START + p.reasoning(
p.until_one_of({ THINK_END, "<|tool_calls_section_begin|>", "<|tool_call_begin|>" })) +
p.optional(p.literal(THINK_END))) : p.eps();
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
// Content only parser (no tools)
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return generation_prompt + reasoning + p.content(p.rest()) + end;
return reasoning + p.content(p.rest()) + end;
}
// Build tool call parsers for each available function
@@ -1257,7 +1254,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
auto content_before_tools = p.content(p.until_one_of({ SECTION_BEGIN, CALL_BEGIN }));
return generation_prompt + reasoning + content_before_tools + tool_calls + end;
return reasoning + content_before_tools + tool_calls + end;
});
data.parser = parser.save();
@@ -1281,14 +1278,13 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
// - Reasoning: <think>{reasoning}</think> (optional)
// - Content: text before a tool call (optional)
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
// Tool calls can appear multiple times (parallel tool calls supported)
// LFM2 format:
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
// - Content: text after reasoning (optional)
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
@@ -1307,16 +1303,13 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
auto end = p.end();
auto reasoning = p.eps();
@@ -1325,11 +1318,11 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return generation_prompt + reasoning + p.content(p.rest()) + end;
return reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call",
p.literal(TOOL_CALL_START) +
p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.literal(TOOL_CALL_END)
)
@@ -1337,7 +1330,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
auto content = p.content(p.until(TOOL_CALL_START));
return generation_prompt + reasoning + content + tool_calls + end;
return reasoning + content + tool_calls + end;
});
data.parser = parser.save();
@@ -1357,87 +1350,13 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens.
// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>.
// - Reasoning: <think>{reasoning}</think> (optional)
// - Content: text before a tool call (optional)
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
// Tool calls can appear multiple times (parallel tool calls supported)
static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call",
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
)
);
auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["}));
auto maybe_start = p.optional(p.literal("<|tool_call_start|>"));
return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
foreach_function(inputs.tools, [&](const json & tool) {
const std::string name = tool.at("function").at("name");
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" });
});
}
return data;
}
static common_chat_params common_chat_params_init_gigachat_v3(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
const autoparser::templates_params & inputs) {
common_chat_params data;
@@ -1451,10 +1370,9 @@ static common_chat_params common_chat_params_init_gigachat_v3(
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const auto *tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
auto tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto ret = p.eps();
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
// Build a choice of all available tools
auto tool_choice = p.choice();
@@ -1477,14 +1395,13 @@ static common_chat_params common_chat_params_init_gigachat_v3(
auto tool_call = p.rule("tool-call", p.literal(tool_call_start_prefix) + tool_choice);
auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
ret = p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
} else {
// Content only parser
include_grammar = false;
ret = p.content(p.rest());
return p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
}
return p.literal(inputs.generation_prompt) + ret;
// Content only parser
include_grammar = false;
return p.content(p.rest());
});
data.parser = parser.save();
@@ -1547,50 +1464,6 @@ static void requires_non_null_content(json & messages) {
}
}
// Gemma4 uses a custom tool_responses field instead of role:tool messages.
// Convert consecutive role:tool messages into a single user message with tool_responses.
static void convert_tool_responses_gemma4(json & messages) {
json result = json::array();
size_t i = 0;
while (i < messages.size()) {
if (messages[i].contains("role") && messages[i].at("role") == "tool") {
json tool_responses = json::array();
while (i < messages.size() &&
messages[i].contains("role") &&
messages[i].at("role") == "tool") {
const auto & tool_msg = messages[i];
std::string name;
if (tool_msg.contains("tool_call_id") && tool_msg.at("tool_call_id").is_string()) {
name = tool_msg.at("tool_call_id");
} else if (tool_msg.contains("name") && tool_msg.at("name").is_string()) {
name = tool_msg.at("name");
}
json response;
if (tool_msg.contains("content")) {
const auto & content = tool_msg.at("content");
if (content.is_string()) {
// Try to parse the content as JSON; fall back to raw string
try {
response = json::parse(content.get<std::string>());
} catch (...) {
response = content;
}
} else {
response = content;
}
}
tool_responses.push_back({{"name", name}, {"response", response}});
i++;
}
result.push_back({{"role", "user"}, {"tool_responses", tool_responses}});
} else {
result.push_back(messages[i]);
i++;
}
}
messages = result;
}
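// Illustrative (hypothetical) conversion performed by this workaround:
//   before: [{"role":"tool","tool_call_id":"get_weather","content":"{\"temp\":21}"},
//            {"role":"tool","name":"get_time","content":"noon"}]
//   after:  [{"role":"user","tool_responses":[
//             {"name":"get_weather","response":{"temp":21}},
//             {"name":"get_time","response":"noon"}]}]
// String content that parses as JSON is embedded as JSON; anything else is kept verbatim.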
static void func_args_not_string(json & messages) {
GGML_ASSERT(messages.is_array());
for (auto & message : messages) {
@@ -1623,10 +1496,72 @@ static json common_chat_extra_context() {
return ctx;
}
static std::optional<common_chat_params> try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
const autoparser::generation_params & params) {
static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs) {
autoparser::templates_params params;
params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
? *tmpls->template_tool_use
: *tmpls->template_default;
const auto & src = tmpl.source();
const auto & caps = tmpl.original_caps();
params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
params.add_generation_prompt = inputs.add_generation_prompt;
params.tool_choice = inputs.tool_choice;
params.reasoning_format = inputs.reasoning_format;
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
params.add_bos = tmpls->add_bos;
params.add_eos = tmpls->add_eos;
if (src.find("<|channel|>") == std::string::npos) {
// map developer to system for all models except for GPT-OSS
workaround::map_developer_role_to_system(params.messages);
}
if (!tmpl.original_caps().supports_system_role) {
workaround::system_message_not_supported(params.messages);
}
if (tmpl.original_caps().supports_tool_calls) {
// some templates will require the content field in tool call messages
// to still be non-null, this puts an empty string everywhere where the
// content field is null
workaround::requires_non_null_content(params.messages);
}
if (tmpl.original_caps().supports_object_arguments) {
workaround::func_args_not_string(params.messages);
}
params.extra_context = common_chat_extra_context();
for (auto el : inputs.chat_template_kwargs) {
params.extra_context[el.first] = json::parse(el.second);
}
if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}
// if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
// LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
// params.parallel_tool_calls = false;
// } else {
params.parallel_tool_calls = inputs.parallel_tool_calls;
//}
if (params.tools.is_array()) {
if (params.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && !params.grammar.empty()) {
throw std::runtime_error("Cannot specify grammar with tools");
}
if (caps.supports_tool_calls && !caps.supports_tools) {
LOG_WRN(
"Template supports tool calls but does not natively describe tools. The fallback behaviour used may "
"produce bad results, inspect prompt w/ --verbose & consider overriding the template.\n");
}
}
// Ministral/Mistral Large 3 - uses special reasoning structure fixes, can't use autoparser
// Note: Mistral Small 3.2 uses [CALL_ID] which Ministral doesn't have, so we can distinguish them
if (src.find("[SYSTEM_PROMPT]") != std::string::npos && src.find("[TOOL_CALLS]") != std::string::npos &&
@@ -1656,127 +1591,25 @@ static std::optional<common_chat_params> try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
// LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
if (src.find("List of tools: [") != std::string::npos &&
src.find("<|tool_list_start|>") == std::string::npos) {
LOG_DBG("Using specialized template: LFM2.5\n");
return common_chat_params_init_lfm2_5(tmpl, params);
}
// GigaChatV3 format detection
if (src.find("<|role_sep|>") != std::string::npos &&
src.find("<|message_sep|>") != std::string::npos &&
src.find("<|function_call|>") == std::string::npos) {
src.find("<|function_call|>") == std::string::npos
) {
LOG_DBG("Using specialized template: GigaChatV3\n");
return common_chat_params_init_gigachat_v3(tmpl, params);
}
return std::nullopt;
}
static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs) {
autoparser::generation_params params;
params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
const auto & tmpl =
params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
const auto & src = tmpl.source();
const auto & caps = tmpl.original_caps();
params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
params.tool_choice = inputs.tool_choice;
params.reasoning_format = inputs.reasoning_format;
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
params.add_bos = tmpls->add_bos;
params.add_eos = tmpls->add_eos;
if (src.find("<|channel|>") == std::string::npos) {
// map developer to system for all models except for GPT-OSS
workaround::map_developer_role_to_system(params.messages);
}
if (!tmpl.original_caps().supports_system_role) {
workaround::system_message_not_supported(params.messages);
}
if (tmpl.original_caps().supports_tool_calls) {
// some templates will require the content field in tool call messages
// to still be non-null, this puts an empty string everywhere where the
// content field is null
workaround::requires_non_null_content(params.messages);
}
if (tmpl.original_caps().supports_object_arguments) {
workaround::func_args_not_string(params.messages);
}
if (src.find("'<|tool_call>call:'") != std::string::npos) {
workaround::convert_tool_responses_gemma4(params.messages);
}
params.add_generation_prompt = false;
std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
params.add_generation_prompt = true;
std::string gen_prompt = common_chat_template_direct_apply(tmpl, params);
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
params.generation_prompt = diff.right;
params.add_generation_prompt = inputs.add_generation_prompt;
params.extra_context = common_chat_extra_context();
for (auto el : inputs.chat_template_kwargs) {
params.extra_context[el.first] = json::parse(el.second);
}
if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}
params.parallel_tool_calls = inputs.parallel_tool_calls;
if (params.tools.is_array()) {
if (params.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && !params.grammar.empty()) {
throw std::runtime_error("Cannot specify grammar with tools");
}
if (caps.supports_tool_calls && !caps.supports_tools) {
LOG_WRN(
"Template supports tool calls but does not natively describe tools. The fallback behaviour used may "
"produce bad results, inspect prompt w/ --verbose & consider overriding the template.\n");
}
}
if (inputs.force_pure_content) {
LOG_WRN("Forcing pure content template, will not render reasoning or tools separately.");
// Create the result structure
common_chat_params data;
auto params_copy = params;
params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE;
data.prompt = common_chat_template_direct_apply(tmpl, params_copy);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.generation_prompt = params.generation_prompt;
auto parser = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
return p.prefix(params.generation_prompt) << p.content(p.rest());
});
data.parser = parser.save();
return data;
}
if (auto result = try_specialized_template(tmpl, src, params)) {
result->generation_prompt = params.generation_prompt;
return *result;
}
try {
LOG_DBG("%s: using differential autoparser\n", __func__);
LOG_DBG("Using differential autoparser\n");
struct autoparser::autoparser autoparser;
autoparser.analyze_template(tmpl);
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
@@ -1784,11 +1617,13 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
if (auto_params.supports_thinking) {
auto_params.thinking_start_tag = autoparser.reasoning.start;
auto_params.thinking_end_tag = autoparser.reasoning.end;
// FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
// (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
// but forces <think> open when thinking is enabled)
auto_params.thinking_forced_open =
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
}
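// Illustrative consequence (assuming a DeepSeek-style template): with FORCED_OPEN the rendered
// generation prompt already ends in "<think>", so thinking_forced_open tells the parser to treat
// the model output as reasoning until the first "</think>" is seen.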
auto_params.generation_prompt = params.generation_prompt;
common_peg_arena arena;
arena.load(auto_params.parser);
LOG_DBG("%s: generated parser:\n%s\n\nparser generation prompt: %s\n", __func__, arena.dump(arena.root()).c_str(), auto_params.generation_prompt.c_str());
return auto_params;
} catch (const std::exception & e) {
throw std::invalid_argument(std::string("Unable to generate parser for this template. Automatic parser generation failed: ") + e.what());
@@ -1886,18 +1721,14 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
LOG_DBG("No parser definition detected, assuming pure content parser.");
}
const std::string effective_input = params.generation_prompt.empty()
? input
: params.generation_prompt + input;
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
}
common_peg_parse_context ctx(effective_input, flags);
common_peg_parse_context ctx(input, flags);
auto result = parser.parse(ctx);
if (result.fail()) {
@@ -1907,13 +1738,8 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
// Try to extract any partial results from what was successfully parsed
common_chat_msg msg;
msg.role = "assistant";
std::unique_ptr<common_chat_peg_mapper> mapper;
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
} else {
mapper = std::make_unique<common_chat_peg_mapper>(msg);
}
mapper->from_ast(ctx.ast, result);
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str());
@@ -1922,19 +1748,14 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
return msg;
}
throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
effective_input.substr(result.end));
input.substr(result.end));
}
common_chat_msg msg;
msg.role = "assistant";
std::unique_ptr<common_chat_peg_mapper> mapper;
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
} else {
mapper = std::make_unique<common_chat_peg_mapper>(msg);
}
mapper->from_ast(ctx.ast, result);
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str());

View File

@@ -24,7 +24,7 @@ using json = nlohmann::ordered_json;
struct common_chat_templates;
namespace autoparser {
struct generation_params;
struct templates_params;
} // namespace autoparser
struct common_chat_tool_call {
@@ -184,7 +184,6 @@ enum common_chat_format {
// These are intended to be parsed by the PEG parser
COMMON_CHAT_FORMAT_PEG_SIMPLE,
COMMON_CHAT_FORMAT_PEG_NATIVE,
COMMON_CHAT_FORMAT_PEG_GEMMA4,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@@ -205,7 +204,6 @@ struct common_chat_templates_inputs {
std::map<std::string, std::string> chat_template_kwargs;
bool add_bos = false;
bool add_eos = false;
bool force_pure_content = false;
};
struct common_chat_params {
@@ -213,7 +211,7 @@ struct common_chat_params {
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
std::string generation_prompt;
bool thinking_forced_open = false;
bool supports_thinking = false;
std::string thinking_start_tag; // e.g., "<think>"
std::string thinking_end_tag; // e.g., "</think>"
@@ -230,14 +228,14 @@ struct common_chat_parser_params {
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
std::string generation_prompt;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
bool debug = false; // Enable debug output for PEG parser
common_peg_arena parser = {};
common_chat_parser_params() = default;
common_chat_parser_params(const common_chat_params & chat_params) {
format = chat_params.format;
generation_prompt = chat_params.generation_prompt;
format = chat_params.format;
thinking_forced_open = chat_params.thinking_forced_open;
}
};
@@ -303,7 +301,7 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
std::string common_chat_template_direct_apply(
const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
const autoparser::templates_params & inputs,
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt);

View File

@@ -359,11 +359,6 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
}
void common_init() {
#if defined(_WIN32)
SetConsoleOutputCP(CP_UTF8);
SetConsoleCP(CP_UTF8);
#endif
llama_log_set(common_log_default_callback, NULL);
#ifdef NDEBUG
@@ -372,7 +367,7 @@ void common_init() {
const char * build_type = " (debug)";
#endif
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
std::string common_params_get_system_info(const common_params & params) {
@@ -661,97 +656,6 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
return true;
}
static inline bool glob_class_match(const char c, const char * pattern, const char * class_end) {
const char * class_start = pattern;
bool negated = false;
if (*class_start == '!') {
negated = true;
class_start++;
}
// If first character after negation is ']' or '-', treat it as literal
if (*class_start == ']' || *class_start == '-') {
if (class_start < class_end && *class_start == c) {
return !negated;
}
class_start++;
}
bool matched = false;
while (class_start < class_end) {
if (class_start + 2 < class_end && class_start[1] == '-' && class_start[2] != ']') {
char start_char = *class_start;
char end_char = class_start[2];
if (c >= start_char && c <= end_char) {
matched = true;
break;
}
class_start += 3;
} else {
if (*class_start == c) {
matched = true;
break;
}
class_start++;
}
}
return negated ? !matched : matched;
}
// simple glob: * matches non-/ chars, ** matches anything including /, [] matches character class
static inline bool glob_match(const char * pattern, const char * str) {
if (*pattern == '\0') {
return *str == '\0';
}
if (pattern[0] == '*' && pattern[1] == '*') {
const char * p = pattern + 2;
if (glob_match(p, str)) return true;
if (*str != '\0') return glob_match(pattern, str + 1);
return false;
}
if (*pattern == '*') {
const char * p = pattern + 1;
for (; *str != '\0' && *str != '/'; str++) {
if (glob_match(p, str)) return true;
}
return glob_match(p, str);
}
if (*pattern == '?' && *str != '\0' && *str != '/') {
return glob_match(pattern + 1, str + 1);
}
if (*pattern == '[') {
const char * class_end = pattern + 1;
// If first character after '[' is ']' or '-', treat it as literal
if (*class_end == ']' || *class_end == '-') {
class_end++;
}
while (*class_end != '\0' && *class_end != ']') {
class_end++;
}
if (*class_end == ']') {
if (*str == '\0') return false;
bool matched = glob_class_match(*str, pattern + 1, class_end);
return matched && glob_match(class_end + 1, str + 1);
} else {
if (*str == '[') {
return glob_match(pattern + 1, str + 1);
}
return false;
}
}
if (*pattern == *str) {
return glob_match(pattern + 1, str + 1);
}
return false;
}
bool glob_match(const std::string & pattern, const std::string & str) {
return glob_match(pattern.c_str(), str.c_str());
}
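// Minimal usage sketch for the glob rules documented above (expected results, not part of the
// original sources):
//   glob_match("*.gguf",         "model.gguf")          -> true
//   glob_match("*.gguf",         "dir/model.gguf")      -> false  ('*' stops at '/')
//   glob_match("**/*.gguf",      "dir/sub/model.gguf")  -> true   ('**' crosses '/')
//   glob_match("file[0-9].txt",  "file7.txt")           -> true
//   glob_match("file[!0-9].txt", "fileA.txt")           -> true   ('!' negates the class)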
//
// Filesystem utils
//
@@ -1163,7 +1067,7 @@ common_init_result::common_init_result(common_params & params) :
const llama_vocab * vocab = llama_model_get_vocab(model);
// load and optionally apply lora adapters
// load and optionally apply lora adapters (must be loaded before context creation)
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
@@ -1248,9 +1152,6 @@ llama_context * common_init_result::context() {
}
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
if (seq_id < 0 || seq_id >= (int) pimpl->samplers.size()) {
return nullptr;
}
return pimpl->samplers[seq_id].get();
}
@@ -1442,7 +1343,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.progress_callback = params.load_progress_callback;
mparams.progress_callback_user_data = params.load_progress_callback_user_data;
mparams.no_alloc = params.no_alloc;
return mparams;
}

View File

@@ -3,14 +3,12 @@
#pragma once
#include "ggml-opt.h"
#include "ggml.h"
#include "llama-cpp.h"
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include <map>
@@ -180,43 +178,6 @@ enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
// Grammar type enumeration
enum common_grammar_type {
COMMON_GRAMMAR_TYPE_NONE, // no grammar set
COMMON_GRAMMAR_TYPE_USER, // user-provided GBNF (--grammar / "grammar" API field)
COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, // auto-generated from JSON schema (--json-schema / "json_schema" API field)
COMMON_GRAMMAR_TYPE_TOOL_CALLS, // auto-generated by chat template parser for function calling
};
// Grammar variant struct with type and grammar string
struct common_grammar {
common_grammar_type type = COMMON_GRAMMAR_TYPE_NONE;
std::string grammar;
// Default constructor - no grammar
common_grammar() = default;
// Constructor with type and grammar string
common_grammar(common_grammar_type t, std::string g) : type(t), grammar(std::move(g)) {
GGML_ASSERT(type != COMMON_GRAMMAR_TYPE_NONE || !grammar.empty());
}
// Check if a grammar is set
bool empty() const { return type == COMMON_GRAMMAR_TYPE_NONE || grammar.empty(); }
};
// Returns the raw grammar string, or empty string if no grammar is set.
inline const std::string & common_grammar_value(const common_grammar & g) {
return g.grammar;
}
// Returns true when the generation_prompt should be prefilled into the grammar sampler.
// Only output-format and tool-call grammars need prefill; user-supplied grammars must not be prefilled.
inline bool common_grammar_needs_prefill(const common_grammar & g) {
return g.type == COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT
|| g.type == COMMON_GRAMMAR_TYPE_TOOL_CALLS;
}
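// Minimal construction sketch (identifiers below are hypothetical, not taken from the diff):
//   common_grammar g(COMMON_GRAMMAR_TYPE_TOOL_CALLS, tool_call_gbnf);
//   if (!g.empty() && common_grammar_needs_prefill(g)) {
//       // feed the already-prefilled generation prompt through the grammar sampler first,
//       // then constrain further generation with common_grammar_value(g)
//   }
// A user-supplied grammar would use COMMON_GRAMMAR_TYPE_USER and is never prefilled.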
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -267,7 +228,7 @@ struct common_params_sampling {
COMMON_SAMPLER_TYPE_TEMPERATURE,
};
common_grammar grammar; // optional grammar constraint (user / output-format / tool-calls)
std::string grammar; // optional BNF-like grammar to constrain sampling
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
std::set<llama_token> preserved_tokens;
@@ -275,15 +236,10 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
// The assistant generation prompt already prefilled into the prompt.
// Fed to the grammar sampler (to advance past pre-existing tokens) and used
// to determine the reasoning budget sampler's initial state.
// Only applied when the grammar is of output-format or tool-calls type.
std::string generation_prompt;
// reasoning budget sampler parameters
// these are populated by the server/CLI based on chat template params
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
bool reasoning_budget_activate_immediately = false;
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
@@ -573,7 +529,6 @@ struct common_params {
// server params
int32_t port = 8080; // server listens on this network port
bool reuse_port = false; // allow multiple sockets to bind to the same port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
@@ -589,7 +544,6 @@ struct common_params {
std::string chat_template = ""; // NOLINT
bool use_jinja = true; // NOLINT
bool enable_chat_template = true;
bool force_pure_content_parser = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
@@ -614,9 +568,6 @@ struct common_params {
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
// enable built-in tools
std::vector<std::string> server_tools;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
@@ -679,7 +630,6 @@ struct common_params {
// return false from callback to abort model loading or true to continue
llama_progress_callback load_progress_callback = NULL;
void * load_progress_callback_user_data = NULL;
bool no_alloc = false; // Don't allocate model buffers
};
// call once at the start of a program if it uses libcommon
@@ -795,8 +745,6 @@ std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
bool glob_match(const std::string & pattern, const std::string & str);
//
// Filesystem utils
//

View File

@@ -1,9 +1,9 @@
#include "arg.h"
#include "common.h"
#include "gguf.h" // for reading GGUF splits
#include "log.h"
#include "download.h"
#include "hf-cache.h"
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
@@ -15,7 +15,6 @@
#include <map>
#include <mutex>
#include <regex>
#include <unordered_set>
#include <string>
#include <thread>
#include <vector>
@@ -36,6 +35,8 @@
#endif
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
// isatty
#if defined(_WIN32)
#include <io.h>
@@ -50,6 +51,31 @@ using json = nlohmann::ordered_json;
//
// validate repo name format: owner/repo
static bool validate_repo_name(const std::string & repo) {
static const std::regex repo_regex(R"(^[A-Za-z0-9_.\-]+\/[A-Za-z0-9_.\-]+$)");
return std::regex_match(repo, repo_regex);
}
static std::string get_manifest_path(const std::string & repo, const std::string & tag) {
// we use "=" to avoid clashing with other component, while still being allowed on windows
std::string fname = "manifest=" + repo + "=" + tag + ".json";
if (!validate_repo_name(repo)) {
throw std::runtime_error("error: repo name must be in the format 'owner/repo'");
}
string_replace_all(fname, "/", "=");
return fs_get_cache_file(fname);
}
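// Example (hypothetical repo and tag): get_manifest_path("ggml-org/model-GGUF", "Q4_K_M")
// yields the cache file name "manifest=ggml-org=model-GGUF=Q4_K_M.json" -- four
// '='-separated parts, matching what common_list_cached_models() later splits apart.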
static std::string read_file(const std::string & fname) {
std::ifstream file(fname);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
}
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
file.close();
return content;
}
static void write_file(const std::string & fname, const std::string & content) {
const std::string fname_tmp = fname + ".tmp";
std::ofstream file(fname_tmp);
@@ -106,7 +132,7 @@ static bool is_http_status_ok(int status) {
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "";
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
@@ -119,9 +145,6 @@ class ProgressBar {
static inline std::map<const ProgressBar *, int> lines;
static inline int max_line = 0;
std::string filename;
size_t len = 0;
static void cleanup(const ProgressBar * line) {
lines.erase(line);
if (lines.empty()) {
@@ -138,23 +161,7 @@ class ProgressBar {
}
public:
ProgressBar(const std::string & url = "") : filename(url) {
if (auto pos = filename.rfind('/'); pos != std::string::npos) {
filename = filename.substr(pos + 1);
}
if (auto pos = filename.find('?'); pos != std::string::npos) {
filename = filename.substr(0, pos);
}
for (size_t i = 0; i < filename.size(); ++i) {
if ((filename[i] & 0xC0) != 0x80) {
if (len++ == 39) {
filename.resize(i);
filename += "…";
break;
}
}
}
}
ProgressBar() = default;
~ProgressBar() {
std::lock_guard<std::mutex> lock(mutex);
@@ -162,7 +169,11 @@ public:
}
void update(size_t current, size_t total) {
if (!total || !is_output_a_tty()) {
if (!is_output_a_tty()) {
return;
}
if (!total) {
return;
}
@@ -174,27 +185,28 @@ public:
}
int lines_up = max_line - lines[this];
size_t bar = 55 - len;
size_t width = 50;
size_t pct = (100 * current) / total;
size_t pos = (bar * current) / total;
size_t pos = (width * current) / total;
std::cout << "\033[s";
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "A";
}
std::cout << '\r' << "Downloading " << filename << " ";
std::cout << "\033[2K\r["
<< std::string(pos, '=')
<< (pos < width ? ">" : "")
<< std::string(width - pos, ' ')
<< "] " << std::setw(3) << pct << "% ("
<< current / (1024 * 1024) << " MB / "
<< total / (1024 * 1024) << " MB) "
<< "\033[u";
for (size_t i = 0; i < bar; ++i) {
std::cout << (i < pos ? "█" : " ");
}
std::cout << std::setw(4) << pct << "%\033[K";
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "B";
}
std::cout << '\r' << std::flush;
std::cout.flush();
if (current == total) {
cleanup(this);
cleanup(this);
}
}
@@ -222,7 +234,7 @@ static bool common_pull_file(httplib::Client & cli,
const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size;
size_t progress_step = 0;
ProgressBar bar(resolve_path);
ProgressBar bar;
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
@@ -278,8 +290,7 @@ static bool common_pull_file(httplib::Client & cli,
static int common_download_file_single_online(const std::string & url,
const std::string & path,
const std::string & bearer_token,
const common_header_list & custom_headers,
bool skip_etag = false) {
const common_header_list & custom_headers) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
@@ -299,16 +310,11 @@ static int common_download_file_single_online(const std::string & url,
const bool file_exists = std::filesystem::exists(path);
if (file_exists && skip_etag) {
LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
std::string last_etag;
if (file_exists) {
last_etag = read_etag(path);
} else {
LOG_DBG("%s: no previous model file found %s\n", __func__, path.c_str());
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
auto head = cli.Head(parts.path);
@@ -342,11 +348,11 @@ static int common_download_file_single_online(const std::string & url,
if (file_exists) {
if (etag.empty()) {
LOG_DBG("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (!last_etag.empty() && last_etag == etag) {
LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (remove(path.c_str()) != 0) {
@@ -355,12 +361,6 @@ static int common_download_file_single_online(const std::string & url,
}
}
{ // silent
std::error_code ec;
std::filesystem::path p(path);
std::filesystem::create_directories(p.parent_path(), ec);
}
const std::string path_temporary = path + ".downloadInProgress";
int delay = retry_delay_seconds;
@@ -382,7 +382,7 @@ static int common_download_file_single_online(const std::string & url,
}
}
LOG_DBG("%s: downloading from %s to %s (etag:%s)...\n",
LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(),
path_temporary.c_str(), etag.c_str());
@@ -391,7 +391,7 @@ static int common_download_file_single_online(const std::string & url,
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
}
if (!etag.empty() && !skip_etag) {
if (!etag.empty()) {
write_etag(path, etag);
}
return head->status;
@@ -440,10 +440,9 @@ int common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
bool offline,
const common_header_list & headers,
bool skip_etag) {
const common_header_list & headers) {
if (!offline) {
return common_download_file_single_online(url, path, bearer_token, headers, skip_etag);
return common_download_file_single_online(url, path, bearer_token, headers);
}
if (!std::filesystem::exists(path)) {
@@ -451,311 +450,197 @@ int common_download_file_single(const std::string & url,
return -1;
}
LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
return 304; // Not Modified - fake cached response
}
struct gguf_split_info {
std::string prefix; // tag included
std::string tag;
int index;
int count;
};
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
futures_download.reserve(urls.size());
static gguf_split_info get_gguf_split_info(const std::string & path) {
static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
static const std::regex re_tag("[-.]([A-Z0-9_]+)$", std::regex::icase);
std::smatch m;
std::string prefix = path;
if (!string_remove_suffix(prefix, ".gguf")) {
return {};
for (auto const & item : urls) {
futures_download.push_back(
std::async(
std::launch::async,
[&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
return is_http_status_ok(http_status);
},
item
)
);
}
int index = 1;
int count = 1;
if (std::regex_match(prefix, m, re_split)) {
index = std::stoi(m[2].str());
count = std::stoi(m[3].str());
prefix = m[1].str();
}
std::string tag;
if (std::regex_search(prefix, m, re_tag)) {
tag = m[1].str();
for (char & c : tag) {
c = std::toupper((unsigned char)c);
// Wait for all downloads to complete
for (auto & f : futures_download) {
if (!f.get()) {
return false;
}
}
return {std::move(prefix), std::move(tag), index, count};
return true;
}
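// Illustration of the split/tag parsing above (file names are made up):
//   "model-Q4_K_M-00002-of-00003.gguf" -> prefix "model-Q4_K_M", tag "Q4_K_M", index 2, count 3
//   "model-F16.gguf"                   -> prefix "model-F16",    tag "F16",    index 1, count 1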
// Q4_0 -> 4, F16 -> 16, NVFP4 -> 4, Q8_K_M -> 8, etc
static int extract_quant_bits(const std::string & filename) {
auto split = get_gguf_split_info(filename);
auto pos = split.tag.find_first_of("0123456789");
if (pos == std::string::npos) {
return 0;
}
return std::stoi(split.tag.substr(pos));
}
static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files,
const hf_cache::hf_file & file) {
auto split = get_gguf_split_info(file.path);
if (split.count <= 1) {
return {file};
}
hf_cache::hf_files result;
for (const auto & f : files) {
auto split_f = get_gguf_split_info(f.path);
if (split_f.count == split.count && split_f.prefix == split.prefix) {
result.push_back(f);
}
}
return result;
}
static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
const std::string & model) {
hf_cache::hf_file best;
size_t best_depth = 0;
int best_diff = 0;
bool found = false;
auto model_bits = extract_quant_bits(model);
auto model_parts = string_split<std::string>(model, '/');
auto model_dir = model_parts.end() - 1;
for (const auto & f : files) {
if (!string_ends_with(f.path, ".gguf") ||
f.path.find("mmproj") == std::string::npos) {
continue;
}
auto mmproj_parts = string_split<std::string>(f.path, '/');
auto mmproj_dir = mmproj_parts.end() - 1;
auto [_, dir] = std::mismatch(model_parts.begin(), model_dir,
mmproj_parts.begin(), mmproj_dir);
if (dir != mmproj_dir) {
continue;
}
size_t depth = dir - mmproj_parts.begin();
auto bits = extract_quant_bits(f.path);
auto diff = std::abs(bits - model_bits);
if (!found || depth > best_depth || (depth == best_depth && diff < best_diff)) {
best = f;
best_depth = depth;
best_diff = diff;
found = true;
}
}
return best;
}
static bool gguf_filename_is_model(const std::string & filepath) {
if (!string_ends_with(filepath, ".gguf")) {
bool common_download_model(const common_params_model & model,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
// Basic validation of the model.url
if (model.url.empty()) {
LOG_ERR("%s: invalid model url\n", __func__);
return false;
}
std::string filename = filepath;
if (auto pos = filename.rfind('/'); pos != std::string::npos) {
filename = filename.substr(pos + 1);
const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
if (!is_http_status_ok(http_status)) {
return false;
}
return filename.find("mmproj") == std::string::npos &&
filename.find("imatrix") == std::string::npos;
}
// check for additional GGUFs split to download
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
return false;
}
static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
const std::string & tag) {
std::vector<std::string> tags;
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split >= 0) {
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
}
if (!tag.empty()) {
tags.push_back(tag);
} else {
tags = {"Q4_K_M", "Q4_0"};
gguf_free(ctx_gguf);
}
for (const auto & t : tags) {
std::regex pattern(t + "[.-]", std::regex::icase);
for (const auto & f : files) {
if (gguf_filename_is_model(f.path) &&
std::regex_search(f.path, pattern)) {
return f;
if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
return false;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
return false;
}
}
}
for (const auto & f : files) {
if (gguf_filename_is_model(f.path)) {
return f;
}
}
std::vector<std::pair<std::string, std::string>> urls;
for (int idx = 1; idx < n_split; idx++) {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
return {};
}
char split_url[LLAMA_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
static void list_available_gguf_files(const hf_cache::hf_files & files) {
LOG_INF("Available GGUF files:\n");
for (const auto & f : files) {
if (string_ends_with(f.path, ".gguf")) {
LOG_INF(" - %s\n", f.path.c_str());
}
}
}
struct hf_plan {
hf_cache::hf_files model_files;
hf_cache::hf_file mmproj;
};
static hf_plan get_hf_plan(const common_params_model & model,
const std::string & token,
const common_download_model_opts & opts) {
hf_plan plan;
hf_cache::hf_files all;
auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
if (!opts.offline) {
all = hf_cache::get_repo_files(repo, token);
}
if (all.empty()) {
all = hf_cache::get_cached_files(repo);
}
if (all.empty()) {
return plan;
}
hf_cache::hf_file primary;
if (!model.hf_file.empty()) {
for (const auto & f : all) {
if (f.path == model.hf_file) {
primary = f;
break;
if (std::string(split_path) == model.path) {
continue; // skip the already downloaded file
}
urls.push_back({split_url, split_path});
}
if (primary.path.empty()) {
LOG_ERR("%s: file '%s' not found in repository\n", __func__, model.hf_file.c_str());
list_available_gguf_files(all);
return plan;
}
} else {
primary = find_best_model(all, tag);
if (primary.path.empty()) {
LOG_ERR("%s: no GGUF files found in repository %s\n", __func__, repo.c_str());
list_available_gguf_files(all);
return plan;
}
// Download in parallel
common_download_file_multiple(urls, bearer_token, offline, headers);
}
plan.model_files = get_split_files(all, primary);
if (opts.download_mmproj) {
plan.mmproj = find_best_mmproj(all, primary.path);
}
return plan;
return true;
}
struct download_task {
std::string url;
std::string path;
};
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline,
const common_header_list & custom_headers) {
// the returned hf_repo is without tag
auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);
static std::vector<download_task> get_url_tasks(const common_params_model & model) {
auto split = get_gguf_split_info(model.url);
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
if (split.count <= 1) {
return {{model.url, model.path}};
// headers
common_header_list headers = custom_headers;
headers.push_back({"Accept", "application/json"});
if (!bearer_token.empty()) {
headers.push_back({"Authorization", "Bearer " + bearer_token});
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
// User-Agent header is already set in common_remote_get_content, no need to set it here
auto filename = split.prefix;
if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
filename = split.prefix.substr(pos + 1);
}
auto parent_path = std::filesystem::path(model.path).parent_path();
auto prefix_path = (parent_path / filename).string();
std::vector<download_task> tasks;
for (int i = 1; i <= split.count; i++) {
auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
tasks.push_back({split.prefix + suffix, prefix_path + suffix});
}
return tasks;
}
common_download_model_result common_download_model(const common_params_model & model,
const std::string & bearer_token,
const common_download_model_opts & opts,
const common_header_list & headers) {
common_download_model_result result;
std::vector<download_task> tasks;
hf_plan hf;
bool is_hf = !model.hf_repo.empty();
if (is_hf) {
hf = get_hf_plan(model, bearer_token, opts);
for (const auto & f : hf.model_files) {
tasks.push_back({f.url, f.local_path});
// make the request
common_remote_params params;
params.headers = headers;
long res_code = 0;
std::string res_str;
bool use_cache = false;
std::string cached_response_path = get_manifest_path(hf_repo, tag);
if (!offline) {
try {
auto res = common_remote_get_content(url, params);
res_code = res.first;
res_str = std::string(res.second.data(), res.second.size());
} catch (const std::exception & e) {
LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
}
if (!hf.mmproj.path.empty()) {
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
}
if (res_code == 0) {
if (std::filesystem::exists(cached_response_path)) {
LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
res_str = read_file(cached_response_path);
res_code = 200;
use_cache = true;
} else {
throw std::runtime_error(
offline ? "error: failed to get manifest (offline mode)"
: "error: failed to get manifest (check your internet connection)");
}
} else if (!model.url.empty()) {
tasks = get_url_tasks(model);
} else {
result.model_path = model.path;
return result;
}
std::string ggufFile;
std::string mmprojFile;
if (tasks.empty()) {
return result;
}
if (res_code == 200 || res_code == 304) {
try {
auto j = json::parse(res_str);
std::vector<std::future<bool>> futures;
for (const auto & task : tasks) {
futures.push_back(std::async(std::launch::async,
[&task, &bearer_token, offline = opts.offline, &headers, is_hf]() {
int status = common_download_file_single(task.url, task.path, bearer_token, offline, headers, is_hf);
return is_http_status_ok(status);
if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
}
));
}
for (auto & f : futures) {
if (!f.get()) {
return {};
if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
}
} catch (const std::exception & e) {
throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
}
}
if (is_hf) {
for (const auto & f : hf.model_files) {
hf_cache::finalize_file(f);
}
result.model_path = hf.model_files[0].final_path;
if (!hf.mmproj.path.empty()) {
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
if (!use_cache) {
// if not using cached response, update the cache file
write_file(cached_response_path, res_str);
}
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
result.model_path = model.path;
throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
}
return result;
// check response
if (ggufFile.empty()) {
throw std::runtime_error("error: model does not have ggufFile");
}
return { hf_repo, ggufFile, mmprojFile };
}
//
@@ -880,21 +765,28 @@ std::string common_docker_resolve_model(const std::string & docker) {
}
std::vector<common_cached_model_info> common_list_cached_models() {
std::unordered_set<std::string> seen;
std::vector<common_cached_model_info> result;
auto files = hf_cache::get_cached_files();
for (const auto & f : files) {
auto split = get_gguf_split_info(f.path);
if (split.index != 1 || split.tag.empty() ||
split.prefix.find("mmproj") != std::string::npos) {
continue;
}
if (seen.insert(f.repo_id + ":" + split.tag).second) {
result.push_back({f.repo_id, split.tag});
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();
const std::vector<common_file_info> files = fs_list(cache_dir, false);
for (const auto & file : files) {
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
common_cached_model_info model_info;
model_info.manifest_path = file.path;
std::string fname = file.name;
string_replace_all(fname, ".json", ""); // remove extension
auto parts = string_split<std::string>(fname, '=');
if (parts.size() == 4) {
// expect format: manifest=<user>=<model>=<tag>
model_info.user = parts[1];
model_info.model = parts[2];
model_info.tag = parts[3];
} else {
// invalid format
continue;
}
model_info.size = 0; // TODO: get GGUF size, not manifest size
models.push_back(model_info);
}
}
return result;
return models;
}

View File

@@ -17,60 +17,54 @@ struct common_remote_params {
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
// split HF repo with tag into <repo, tag>, for example:
// - "ggml-org/models:F16" -> <"ggml-org/models", "F16">
// tag is optional and can be empty
// split HF repo with tag into <repo, tag>
// for example: "user/model:tag" -> <"user/model", "tag">
// if tag is not present, default to "latest"
// example: "user/model" -> <"user/model", "latest">
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);
// Result of common_list_cached_models
struct common_cached_model_info {
std::string repo;
std::string manifest_path;
std::string user;
std::string model;
std::string tag;
size_t size = 0; // GGUF size in bytes
// return string representation like "user/model:tag"
// if tag is "latest", it will be omitted
std::string to_string() const {
return repo + ":" + tag;
return user + "/" + model + (tag == "latest" ? "" : ":" + tag);
}
};
// Options for common_download_model
struct common_download_model_opts {
bool download_mmproj = false;
bool offline = false;
struct common_hf_file_res {
std::string repo; // repo name with ":tag" removed
std::string ggufFile;
std::string mmprojFile;
};
// Result of common_download_model
struct common_download_model_result {
std::string model_path;
std::string mmproj_path;
};
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
common_hf_file_res common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {}
);
// Download model from HuggingFace repo or URL
//
// input (via model struct):
// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
// - model.hf_file: specific file in the repo (requires hf_repo)
// - model.url: simple download (used if hf_repo is empty)
// - model.path: local file path
//
// tag matching (for HF repos without model.hf_file):
// - if tag is specified, searches for GGUF matching that quantization
// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
//
// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
// detected and all parts are downloaded
//
// caching:
// - HF repos: uses HuggingFace cache
// - URLs: uses ETag-based caching
//
// when opts.offline=true, no network requests are made
// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
// then with the closest quantization bits
//
// returns result with model_path and mmproj_path (empty on failure)
common_download_model_result common_download_model(
// returns true if download succeeded
bool common_download_model(
const common_params_model & model,
const std::string & bearer_token,
const common_download_model_opts & opts = {},
bool offline,
const common_header_list & headers = {}
);
@@ -79,13 +73,11 @@ std::vector<common_cached_model_info> common_list_cached_models();
// download single file from url to local path
// returns status code or -1 on error
// skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
int common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {},
bool skip_etag = false);
const common_header_list & headers = {});
// resolve and download model from Docker registry
// return local path to downloaded model file

View File

@@ -1,771 +0,0 @@
#include "hf-cache.h"
#include "common.h"
#include "log.h"
#include "http.h"
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
#include <filesystem>
#include <fstream>
#include <atomic>
#include <regex> // migration only
#include <string>
#include <string_view>
#include <stdexcept>
namespace nl = nlohmann;
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#define HOME_DIR "USERPROFILE"
#include <windows.h>
#else
#define HOME_DIR "HOME"
#include <unistd.h>
#include <pwd.h>
#endif
namespace hf_cache {
namespace fs = std::filesystem;
static fs::path get_cache_directory() {
static const fs::path cache = []() {
struct {
const char * var;
fs::path path;
} entries[] = {
{"LLAMA_CACHE", fs::path()},
{"HF_HUB_CACHE", fs::path()},
{"HUGGINGFACE_HUB_CACHE", fs::path()},
{"HF_HOME", fs::path("hub")},
{"XDG_CACHE_HOME", fs::path("huggingface") / "hub"},
{HOME_DIR, fs::path(".cache") / "huggingface" / "hub"}
};
for (const auto & entry : entries) {
if (auto * p = std::getenv(entry.var); p && *p) {
fs::path base(p);
return entry.path.empty() ? base : base / entry.path;
}
}
#ifndef _WIN32
const struct passwd * pw = getpwuid(getuid());
if (pw->pw_dir && *pw->pw_dir) {
return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
}
#endif
throw std::runtime_error("Failed to determine HF cache directory");
}();
return cache;
}
static std::string folder_name_to_repo(const std::string & folder) {
constexpr std::string_view prefix = "models--";
if (folder.rfind(prefix, 0)) {
return {};
}
std::string result = folder.substr(prefix.length());
string_replace_all(result, "--", "/");
return result;
}
static std::string repo_to_folder_name(const std::string & repo_id) {
constexpr std::string_view prefix = "models--";
std::string result = std::string(prefix) + repo_id;
string_replace_all(result, "/", "--");
return result;
}
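// Round-trip illustration (repo name is hypothetical):
//   repo_to_folder_name("ggml-org/tiny-model")          -> "models--ggml-org--tiny-model"
//   folder_name_to_repo("models--ggml-org--tiny-model") -> "ggml-org/tiny-model"
// This mirrors the huggingface_hub on-disk cache layout.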
static fs::path get_repo_path(const std::string & repo_id) {
return get_cache_directory() / repo_to_folder_name(repo_id);
}
static bool is_hex_char(const char c) {
return (c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f') ||
(c >= '0' && c <= '9');
}
static bool is_hex_string(const std::string & s, size_t expected_len) {
if (s.length() != expected_len) {
return false;
}
for (const char c : s) {
if (!is_hex_char(c)) {
return false;
}
}
return true;
}
static bool is_alphanum(const char c) {
return (c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9');
}
static bool is_special_char(char c) {
return c == '/' || c == '.' || c == '-';
}
// base chars [A-Za-z0-9_] are always valid
// special chars [/.-] must be surrounded by base chars
// exactly one '/' required
static bool is_valid_repo_id(const std::string & repo_id) {
if (repo_id.empty() || repo_id.length() > 256) {
return false;
}
int slash = 0;
bool special = true;
for (const char c : repo_id) {
if (is_alphanum(c) || c == '_') {
special = false;
} else if (is_special_char(c)) {
if (special) {
return false;
}
slash += (c == '/');
special = true;
} else {
return false;
}
}
return !special && slash == 1;
}
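// Examples of the rules above (repo names are illustrative):
//   "ggml-org/gemma-3"  -> valid   (one '/', specials surrounded by base chars)
//   "-org/model"        -> invalid (leading special character)
//   "org//model"        -> invalid (consecutive special characters)
//   "a/b/c"             -> invalid (more than one '/')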
static bool is_valid_hf_token(const std::string & token) {
if (token.length() < 37 || token.length() > 256 ||
!string_starts_with(token, "hf_")) {
return false;
}
for (size_t i = 3; i < token.length(); ++i) {
if (!is_alphanum(token[i])) {
return false;
}
}
return true;
}
static bool is_valid_commit(const std::string & hash) {
return is_hex_string(hash, 40);
}
static bool is_valid_oid(const std::string & oid) {
return is_hex_string(oid, 40) || is_hex_string(oid, 64);
}
static bool is_valid_subpath(const fs::path & path, const fs::path & subpath) {
if (subpath.is_absolute()) {
return false; // never do a / b with b absolute
}
auto b = fs::absolute(path).lexically_normal();
auto t = (b / subpath).lexically_normal();
auto [b_end, _] = std::mismatch(b.begin(), b.end(), t.begin(), t.end());
return b_end == b.end();
}
static void safe_write_file(const fs::path & path, const std::string & data) {
fs::path path_tmp = path.string() + ".tmp";
if (path.has_parent_path()) {
fs::create_directories(path.parent_path());
}
std::ofstream file(path_tmp);
file << data;
file.close();
std::error_code ec;
if (!file.fail()) {
fs::rename(path_tmp, path, ec);
}
if (file.fail() || ec) {
fs::remove(path_tmp, ec);
throw std::runtime_error("failed to write file: " + path.string());
}
}
static nl::json api_get(const std::string & url,
const std::string & token) {
auto [cli, parts] = common_http_client(url);
httplib::Headers headers = {
{"User-Agent", "llama-cpp/" + build_info},
{"Accept", "application/json"}
};
if (is_valid_hf_token(token)) {
headers.emplace("Authorization", "Bearer " + token);
} else if (!token.empty()) {
LOG_WRN("%s: invalid token, authentication disabled\n", __func__);
}
if (auto res = cli.Get(parts.path, headers)) {
auto body = res->body;
if (res->status == 200) {
return nl::json::parse(res->body);
}
try {
body = nl::json::parse(res->body)["error"].get<std::string>();
} catch (...) { }
throw std::runtime_error("GET failed (" + std::to_string(res->status) + "): " + body);
} else {
throw std::runtime_error("HTTPLIB failed: " + httplib::to_string(res.error()));
}
}
static std::string get_repo_commit(const std::string & repo_id,
const std::string & token) {
try {
auto endpoint = get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
if (!json.is_object() ||
!json.contains("branches") || !json["branches"].is_array()) {
LOG_WRN("%s: missing 'branches' for '%s'\n", __func__, repo_id.c_str());
return {};
}
fs::path refs_path = get_repo_path(repo_id) / "refs";
std::string name;
std::string commit;
for (const auto & branch : json["branches"]) {
if (!branch.is_object() ||
!branch.contains("name") || !branch["name"].is_string() ||
!branch.contains("targetCommit") || !branch["targetCommit"].is_string()) {
continue;
}
std::string _name = branch["name"].get<std::string>();
std::string _commit = branch["targetCommit"].get<std::string>();
if (!is_valid_subpath(refs_path, _name)) {
LOG_WRN("%s: skip invalid branch: %s\n", __func__, _name.c_str());
continue;
}
if (!is_valid_commit(_commit)) {
LOG_WRN("%s: skip invalid commit: %s\n", __func__, _commit.c_str());
continue;
}
if (_name == "main") {
name = _name;
commit = _commit;
break;
}
if (name.empty() || commit.empty()) {
name = _name;
commit = _commit;
}
}
if (name.empty() || commit.empty()) {
LOG_WRN("%s: no valid branch for '%s'\n", __func__, repo_id.c_str());
return {};
}
safe_write_file(refs_path / name, commit);
return commit;
} catch (const nl::json::exception & e) {
LOG_ERR("%s: JSON error: %s\n", __func__, e.what());
} catch (const std::exception & e) {
LOG_ERR("%s: error: %s\n", __func__, e.what());
}
return {};
}
hf_files get_repo_files(const std::string & repo_id,
const std::string & token) {
if (!is_valid_repo_id(repo_id)) {
LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
return {};
}
std::string commit = get_repo_commit(repo_id, token);
if (commit.empty()) {
LOG_WRN("%s: failed to resolve commit for %s\n", __func__, repo_id.c_str());
return {};
}
fs::path blobs_path = get_repo_path(repo_id) / "blobs";
fs::path commit_path = get_repo_path(repo_id) / "snapshots" / commit;
hf_files files;
try {
auto endpoint = get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
if (!json.is_array()) {
LOG_WRN("%s: response is not an array for '%s'\n", __func__, repo_id.c_str());
return {};
}
for (const auto & item : json) {
if (!item.is_object() ||
!item.contains("type") || !item["type"].is_string() || item["type"] != "file" ||
!item.contains("path") || !item["path"].is_string()) {
continue;
}
hf_file file;
file.repo_id = repo_id;
file.path = item["path"].get<std::string>();
if (!is_valid_subpath(commit_path, file.path)) {
LOG_WRN("%s: skip invalid path: %s\n", __func__, file.path.c_str());
continue;
}
if (item.contains("lfs") && item["lfs"].is_object()) {
if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
file.oid = item["lfs"]["oid"].get<std::string>();
}
if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
file.size = item["lfs"]["size"].get<size_t>();
}
} else if (item.contains("oid") && item["oid"].is_string()) {
file.oid = item["oid"].get<std::string>();
}
if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
file.size = item["size"].get<size_t>();
}
if (!file.oid.empty() && !is_valid_oid(file.oid)) {
LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
continue;
}
file.url = endpoint + repo_id + "/resolve/" + commit + "/" + file.path;
fs::path final_path = commit_path / file.path;
file.final_path = final_path.string();
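// files with an oid (LFS blobs) are downloaded into blobs/<oid> first and later linked
// or moved into the snapshot by finalize_file(); other files go straight to final_path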
if (!file.oid.empty() && !fs::exists(final_path)) {
fs::path local_path = blobs_path / file.oid;
file.local_path = local_path.string();
} else {
file.local_path = file.final_path;
}
files.push_back(file);
}
} catch (const nl::json::exception & e) {
LOG_ERR("%s: JSON error: %s\n", __func__, e.what());
} catch (const std::exception & e) {
LOG_ERR("%s: error: %s\n", __func__, e.what());
}
return files;
}
static std::string get_cached_ref(const fs::path & repo_path) {
fs::path refs_path = repo_path / "refs";
if (!fs::is_directory(refs_path)) {
return {};
}
std::string fallback;
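// prefer the commit recorded for "main"; otherwise fall back to the first valid ref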
for (const auto & entry : fs::directory_iterator(refs_path)) {
if (!entry.is_regular_file()) {
continue;
}
std::ifstream f(entry.path());
std::string commit;
if (!f || !std::getline(f, commit) || commit.empty()) {
continue;
}
if (!is_valid_commit(commit)) {
LOG_WRN("%s: skip invalid commit: %s\n", __func__, commit.c_str());
continue;
}
if (entry.path().filename() == "main") {
return commit;
}
if (fallback.empty()) {
fallback = commit;
}
}
return fallback;
}
hf_files get_cached_files(const std::string & repo_id) {
fs::path cache_dir = get_cache_directory();
if (!fs::exists(cache_dir)) {
return {};
}
if (!repo_id.empty() && !is_valid_repo_id(repo_id)) {
LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
return {};
}
hf_files files;
for (const auto & repo : fs::directory_iterator(cache_dir)) {
if (!repo.is_directory()) {
continue;
}
fs::path snapshots_path = repo.path() / "snapshots";
if (!fs::exists(snapshots_path)) {
continue;
}
std::string _repo_id = folder_name_to_repo(repo.path().filename().string());
if (!is_valid_repo_id(_repo_id)) {
continue;
}
if (!repo_id.empty() && _repo_id != repo_id) {
continue;
}
std::string commit = get_cached_ref(repo.path());
fs::path commit_path = snapshots_path / commit;
if (commit.empty() || !fs::is_directory(commit_path)) {
continue;
}
for (const auto & entry : fs::recursive_directory_iterator(commit_path)) {
if (!entry.is_regular_file() && !entry.is_symlink()) {
continue;
}
fs::path path = entry.path().lexically_relative(commit_path);
if (!path.empty()) {
hf_file file;
file.repo_id = _repo_id;
file.path = path.generic_string();
file.local_path = entry.path().string();
file.final_path = file.local_path;
files.push_back(std::move(file));
}
}
}
return files;
}
std::string finalize_file(const hf_file & file) {
static std::atomic<bool> symlinks_disabled{false};
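// process-wide flag: once a symlink attempt fails (e.g. unsupported filesystem),
// fall back to moving/copying files into the snapshot for the rest of the run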
std::error_code ec;
fs::path local_path(file.local_path);
fs::path final_path(file.final_path);
if (local_path == final_path || fs::exists(final_path, ec)) {
return file.final_path;
}
if (!fs::exists(local_path, ec)) {
return file.final_path;
}
fs::create_directories(final_path.parent_path(), ec);
if (!symlinks_disabled) {
fs::path target = fs::relative(local_path, final_path.parent_path(), ec);
if (!ec) {
fs::create_symlink(target, final_path, ec);
}
if (!ec) {
return file.final_path;
}
}
if (!symlinks_disabled.exchange(true)) {
LOG_WRN("%s: failed to create symlink: %s\n", __func__, ec.message().c_str());
LOG_WRN("%s: switching to degraded mode\n", __func__);
}
fs::rename(local_path, final_path, ec);
if (ec) {
LOG_WRN("%s: failed to move file to snapshots: %s\n", __func__, ec.message().c_str());
fs::copy(local_path, final_path, ec);
if (ec) {
LOG_ERR("%s: failed to copy file to snapshots: %s\n", __func__, ec.message().c_str());
}
}
return file.final_path;
}
// delete everything after this line, one day
// copied from download.cpp without the tag part
struct gguf_split_info {
std::string prefix; // tag included
int index;
int count;
};
static gguf_split_info get_gguf_split_info(const std::string & path) {
static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
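// e.g. "model-00001-of-00003.gguf" -> {prefix "model", index 1, count 3};
// a non-split "model.gguf" yields {prefix "model", index 1, count 1}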
std::smatch m;
std::string prefix = path;
if (!string_remove_suffix(prefix, ".gguf")) {
return {};
}
int index = 1;
int count = 1;
if (std::regex_match(prefix, m, re_split)) {
index = std::stoi(m[2].str());
count = std::stoi(m[3].str());
prefix = m[1].str();
}
return {std::move(prefix), index, count};
}
static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
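// legacy manifest filenames look like "manifest=<owner>=<repo>=<...>.json";
// the two captures are the repo owner and the repo name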
std::smatch match;
if (std::regex_match(filename, match, re)) {
return {match[1].str(), match[2].str()};
}
return {};
}
static std::string make_old_cache_filename(const std::string & owner,
const std::string & repo,
const std::string & filename) {
auto result = owner + "_" + repo + "_" + filename;
string_replace_all(result, "/", "_");
return result;
}
struct migrate_file {
std::string path;
std::string sha256;
size_t size;
fs::path old_path;
fs::path etag_path;
const hf_file * file;
};
using migrate_files = std::vector<migrate_file>;
static bool collect_file(const fs::path & old_cache,
const std::string & owner,
const std::string & repo,
const std::string & path,
const std::string & sha256,
const hf_files & files,
migrate_files & to_migrate) {
const hf_file * file = nullptr;
for (const auto & f : files) {
if (f.path == path) {
file = &f;
break;
}
}
std::string old_filename = make_old_cache_filename(owner, repo, path);
fs::path old_path = old_cache / old_filename;
fs::path etag_path = old_path.string() + ".etag";
if (!fs::exists(old_path)) {
if (file && fs::exists(file->final_path)) {
return true;
}
LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str());
return false;
}
if (!file) {
LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str());
return false;
}
if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) {
LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str());
return false;
}
if (file->size > 0) {
size_t size = fs::file_size(old_path);
if (size != file->size) {
LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size);
return false;
}
}
to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file});
return true;
}
static bool collect_files(const fs::path & old_cache,
const std::string & owner,
const std::string & repo,
const nl::json & node,
const hf_files & files,
migrate_files & to_migrate) {
if (!node.contains("rfilename") ||
!node.contains("lfs") ||
!node["lfs"].contains("sha256")) {
return true;
}
std::string path = node["rfilename"];
std::string sha256 = node["lfs"]["sha256"];
auto split = get_gguf_split_info(path);
if (split.count <= 1) {
return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
}
std::vector<std::pair<std::string, std::string>> splits;
for (const auto & f : files) {
auto split_f = get_gguf_split_info(f.path);
if (split_f.count == split.count && split_f.prefix == split.prefix) {
// sadly the manifest only provides the sha256 of the first file (index == 1)
// the rest will be verified using the size...
std::string f_sha256 = (split_f.index == 1) ? sha256 : "";
splits.emplace_back(f.path, f_sha256);
}
}
if ((int)splits.size() != split.count) {
LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size());
return false;
}
for (const auto & [f_path, f_sha256] : splits) {
if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) {
return false;
}
}
return true;
}
static bool migrate_file(const migrate_file & file) {
std::error_code ec;
fs::path new_path(file.file->local_path);
fs::create_directories(new_path.parent_path(), ec);
if (!fs::exists(new_path, ec)) {
fs::rename(file.old_path, new_path, ec);
if (ec) {
fs::copy_file(file.old_path, new_path, ec);
if (ec) {
LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
return false;
}
}
fs::remove(file.old_path, ec);
}
fs::remove(file.etag_path, ec);
std::string filename = finalize_file(*file.file);
LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
return true;
}
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
fs::path old_cache = fs_get_cache_directory();
if (!fs::exists(old_cache)) {
return;
}
if (offline) {
LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__);
return; // -hf is not going to work
}
bool warned = false;
for (const auto & entry : fs::directory_iterator(old_cache)) {
if (!entry.is_regular_file()) {
continue;
}
auto filename = entry.path().filename().string();
auto [owner, repo] = parse_manifest_name(filename);
if (owner.empty() || repo.empty()) {
continue;
}
if (!warned) {
warned = true;
LOG_WRN("================================================================================\n"
"WARNING: Migrating cache to HuggingFace cache directory\n"
" Old cache: %s\n"
" New cache: %s\n"
"This one-time migration moves models previously downloaded with -hf\n"
"from the legacy llama.cpp cache to the standard HuggingFace cache.\n"
"Models downloaded with --model-url are not affected.\n"
"================================================================================\n",
old_cache.string().c_str(), get_cache_directory().string().c_str());
}
auto repo_id = owner + "/" + repo;
auto files = get_repo_files(repo_id, token);
if (files.empty()) {
LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str());
continue;
}
migrate_files to_migrate;
bool ok = true;
try {
std::ifstream manifest(entry.path());
auto json = nl::json::parse(manifest);
for (const char * key : {"ggufFile", "mmprojFile"}) {
if (json.contains(key)) {
if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
ok = false;
break;
}
}
}
} catch (const std::exception & e) {
LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
continue;
}
if (!ok) {
LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
continue;
}
for (const auto & file : to_migrate) {
if (!migrate_file(file)) {
ok = false;
break;
}
}
if (!ok) {
LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
continue;
}
LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
fs::remove(entry.path());
}
}
} // namespace hf_cache

View File

@@ -1,36 +0,0 @@
#pragma once
#include <string>
#include <vector>
// Ref: https://huggingface.co/docs/hub/local-cache.md
namespace hf_cache {
struct hf_file {
std::string path;
std::string url;
std::string local_path;
std::string final_path;
std::string oid;
std::string repo_id;
size_t size = 0; // only for the migration
};
using hf_files = std::vector<hf_file>;
// Get files from HF API
hf_files get_repo_files(
const std::string & repo_id,
const std::string & token
);
hf_files get_cached_files(const std::string & repo_id = {});
// Create snapshot path (link or move/copy) and return it
std::string finalize_file(const hf_file & file);
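// Typical flow (sketch): get_repo_files() lists the files, the caller downloads each
// file.url into file.local_path, then finalize_file() places it under snapshots/<commit>/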
// TODO: Remove later
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false);
} // namespace hf_cache

View File

@@ -53,13 +53,6 @@ private:
return tokens[current + offset];
}
const token & next() {
if (current >= tokens.size()) {
throw parser_exception("Parser Error: Unexpected EOF", source, tokens.empty() ? 0 : tokens.back().pos);
}
return tokens[current++];
}
token expect(token::type type, const std::string& error) {
const auto & t = peek();
if (t.t != type) {
@@ -97,9 +90,9 @@ private:
size_t start_pos = current;
switch (peek().t) {
case token::comment:
return mk_stmt<comment_statement>(start_pos, next().value);
return mk_stmt<comment_statement>(start_pos, tokens[current++].value);
case token::text:
return mk_stmt<string_literal>(start_pos, next().value);
return mk_stmt<string_literal>(start_pos, tokens[current++].value);
case token::open_statement:
return parse_jinja_statement();
case token::open_expression:
@@ -126,7 +119,8 @@ private:
}
size_t start_pos = current;
std::string name = next().value;
std::string name = peek().value;
current++; // consume identifier
statement_ptr result;
if (name == "set") {
@@ -208,7 +202,7 @@ private:
// Ignore generation blocks (transformers-specific)
// See https://github.com/huggingface/transformers/pull/30650 for more information.
result = mk_stmt<noop_statement>(start_pos);
++current;
current++;
} else {
throw std::runtime_error("Unknown statement: " + name);
@@ -223,7 +217,7 @@ private:
statements body;
if (is(token::equals)) {
++current;
current++;
value = parse_expression_sequence();
} else {
// parsing multiline set here
@@ -286,7 +280,7 @@ private:
exprs.push_back(primary ? parse_primary_expression() : parse_expression());
bool is_tuple = is(token::comma);
while (is(token::comma)) {
++current; // consume comma
current++; // consume comma
exprs.push_back(primary ? parse_primary_expression() : parse_expression());
}
return is_tuple ? mk_stmt<tuple_literal>(start_pos, std::move(exprs)) : std::move(exprs[0]);
@@ -296,7 +290,7 @@ private:
// e.g., `message` in `for message in messages`
auto loop_var = parse_expression_sequence(true); // should be an identifier/tuple
if (!is_identifier("in")) throw std::runtime_error("Expected 'in'");
++current; // consume 'in'
current++;
// `messages` in `for message in messages`
auto iterable = parse_expression();
@@ -311,8 +305,7 @@ private:
}
if (is_statement({"else"})) {
++current; // consume {%
++current; // consume 'else'
current += 2;
expect(token::close_statement, "Expected %}");
while (!is_statement({"endfor"})) {
alternate.push_back(parse_any());
@@ -354,7 +347,7 @@ private:
auto left = parse_logical_and_expression();
while (is_identifier("or")) {
size_t start_pos = current;
token op = next();
token op = tokens[current++];
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_logical_and_expression());
}
return left;
@@ -364,7 +357,7 @@ private:
auto left = parse_logical_negation_expression();
while (is_identifier("and")) {
size_t start_pos = current;
auto op = next();
auto op = tokens[current++];
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_logical_negation_expression());
}
return left;
@@ -374,7 +367,7 @@ private:
// Try parse unary operators
if (is_identifier("not")) {
size_t start_pos = current;
auto op = next();
auto op = tokens[current++];
return mk_stmt<unary_expression>(start_pos, op, parse_logical_negation_expression());
}
return parse_comparison_expression();
@@ -389,12 +382,11 @@ private:
size_t start_pos = current;
if (is_identifier("not") && peek(1).t == token::identifier && peek(1).value == "in") {
op = {token::identifier, "not in", tokens[current].pos};
++current; // consume 'not'
++current; // consume 'in'
current += 2;
} else if (is_identifier("in")) {
op = next();
op = tokens[current++];
} else if (is(token::comparison_binary_operator)) {
op = next();
op = tokens[current++];
} else break;
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_additive_expression());
}
@@ -405,7 +397,7 @@ private:
auto left = parse_multiplicative_expression();
while (is(token::additive_binary_operator)) {
size_t start_pos = current;
auto op = next();
auto op = tokens[current++];
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_multiplicative_expression());
}
return left;
@@ -415,7 +407,7 @@ private:
auto left = parse_test_expression();
while (is(token::multiplicative_binary_operator)) {
size_t start_pos = current;
auto op = next();
auto op = tokens[current++];
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_test_expression());
}
return left;
@@ -425,9 +417,9 @@ private:
auto operand = parse_filter_expression();
while (is_identifier("is")) {
size_t start_pos = current;
++current; // consume 'is'
current++;
bool negate = false;
if (is_identifier("not")) { ++current; negate = true; }
if (is_identifier("not")) { current++; negate = true; }
auto test_id = parse_primary_expression();
// FIXME: tests can also be expressed like this: if x is eq 3
if (is(token::open_paren)) test_id = parse_call_expression(std::move(test_id));
@@ -440,7 +432,7 @@ private:
auto operand = parse_call_member_expression();
while (is(token::pipe)) {
size_t start_pos = current;
++current; // consume pipe
current++;
auto filter = parse_primary_expression();
if (is(token::open_paren)) filter = parse_call_expression(std::move(filter));
operand = mk_stmt<filter_expression>(start_pos, std::move(operand), std::move(filter));
@@ -498,7 +490,7 @@ private:
statement_ptr parse_member_expression(statement_ptr object) {
size_t start_pos = current;
while (is(token::dot) || is(token::open_square_bracket)) {
auto op = next();
auto op = tokens[current++];
bool computed = op.t == token::open_square_bracket;
statement_ptr prop;
if (computed) {
@@ -539,15 +531,12 @@ private:
statement_ptr step = slices.size() > 2 ? std::move(slices[2]) : nullptr;
return mk_stmt<slice_expression>(start_pos, std::move(start), std::move(stop), std::move(step));
}
if (slices.empty()) {
return mk_stmt<blank_expression>(start_pos);
}
return std::move(slices[0]);
}
statement_ptr parse_primary_expression() {
size_t start_pos = current;
auto t = next();
auto t = tokens[current++];
switch (t.t) {
case token::numeric_literal:
if (t.value.find('.') != std::string::npos) {
@@ -558,7 +547,7 @@ private:
case token::string_literal: {
std::string val = t.value;
while (is(token::string_literal)) {
val += next().value;
val += tokens[current++].value;
}
return mk_stmt<string_literal>(start_pos, val);
}
@@ -573,9 +562,9 @@ private:
statements vals;
while (!is(token::close_square_bracket)) {
vals.push_back(parse_expression());
if (is(token::comma)) ++current;
if (is(token::comma)) current++;
}
++current;
current++;
return mk_stmt<array_literal>(start_pos, std::move(vals));
}
case token::open_curly_bracket: {
@@ -584,9 +573,9 @@ private:
auto key = parse_expression();
expect(token::colon, "Expected :");
pairs.push_back({std::move(key), parse_expression()});
if (is(token::comma)) ++current;
if (is(token::comma)) current++;
}
++current;
current++;
return mk_stmt<object_literal>(start_pos, std::move(pairs));
}
default:

View File

@@ -667,9 +667,8 @@ value macro_statement::execute_impl(context & ctx) {
if (is_stmt<identifier>(this->args[i])) {
// normal parameter
std::string param_name = cast_stmt<identifier>(this->args[i])->val;
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
macro_ctx.set_val(param_name, args.get_pos(i));
} else if (is_stmt<keyword_argument_expression>(this->args[i])) {
// default argument used as normal parameter
auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
@@ -677,9 +676,8 @@ value macro_statement::execute_impl(context & ctx) {
throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
}
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
macro_ctx.set_val(param_name, args.get_pos(i));
} else {
throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
}
@@ -771,15 +769,10 @@ value member_expression::execute_impl(context & ctx) {
}
JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str());
value val = mk_val<value_undefined>("object_property");
if (property->is_undefined()) {
JJ_DEBUG("%s", "Member expression property is undefined, returning undefined");
return val;
}
ensure_key_type_allowed(property);
value val = mk_val<value_undefined>("object_property");
if (is_val<value_undefined>(object)) {
JJ_DEBUG("%s", "Accessing property on undefined object, returning undefined");
return val;

View File

@@ -263,14 +263,6 @@ struct comment_statement : public statement {
// Expressions
// Represents an omitted expression in a computed member, e.g. `a[]`.
struct blank_expression : public expression {
std::string type() const override { return "BlankExpression"; }
value execute_impl(context &) override {
return mk_val<value_undefined>();
}
};
struct member_expression : public expression {
statement_ptr object;
statement_ptr property;

View File

@@ -451,7 +451,7 @@ struct value_array_t : public value_t {
}
protected:
virtual bool equivalent(const value_t & other) const override {
return typeid(*this) == typeid(other) && is_hashable() && other.is_hashable() && std::equal(val_arr.begin(), val_arr.end(), other.val_arr.begin(), other.val_arr.end(), value_equivalence());
return typeid(*this) == typeid(other) && is_hashable() && other.is_hashable() && std::equal(val_arr.begin(), val_arr.end(), other.val_arr.begin(), value_equivalence());
}
};
using value_array = std::shared_ptr<value_array_t>;
@@ -587,7 +587,7 @@ struct value_object_t : public value_t {
}
protected:
virtual bool equivalent(const value_t & other) const override {
return typeid(*this) == typeid(other) && is_hashable() && other.is_hashable() && std::equal(val_obj.begin(), val_obj.end(), other.val_obj.begin(), other.val_obj.end(), value_equivalence());
return typeid(*this) == typeid(other) && is_hashable() && other.is_hashable() && std::equal(val_obj.begin(), val_obj.end(), other.val_obj.begin(), value_equivalence());
}
};
using value_object = std::shared_ptr<value_object_t>;

View File

@@ -416,30 +416,15 @@ private:
i++;
} else if (c == '(') {
i++;
if (i < length && sub_pattern[i] == '?') {
if (i + 1 < length && sub_pattern[i + 1] == ':') {
i += 2; // skip "?:" for non-capturing group, treat as regular group
} else {
// lookahead/lookbehind (?=, ?!, ?<=, ?<!) - not supported
if (i < length) {
if (sub_pattern[i] == '?') {
_warnings.push_back("Unsupported pattern syntax");
// skip to matching ')' to avoid UB on empty seq
int depth = 1;
while (i < length && depth > 0) {
if (sub_pattern[i] == '\\' && i + 1 < length) {
i += 2; // skip escaped character
} else {
if (sub_pattern[i] == '(') depth++;
else if (sub_pattern[i] == ')') depth--;
i++;
}
}
continue;
}
}
seq.emplace_back("(" + to_rule(transform()) + ")", false);
} else if (c == ')') {
i++;
if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) {
if (start > 0 && sub_pattern[start - 1] != '(') {
_errors.push_back("Unbalanced parentheses");
}
return join_seq();

View File

@@ -51,7 +51,7 @@ struct common_ngram_map_value {
// statistics of a n-gram
struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history
size_t stat_idx; // index of last token of statistics computation (key_num, values)
size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key

View File

@@ -1557,36 +1557,6 @@ static std::unordered_set<std::string> collect_reachable_rules(
// GBNF generation implementation
void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const {
auto schema_delegates = [](const common_peg_schema_parser & s) -> bool {
if (!s.schema) {
return true;
}
if (s.raw && s.schema->contains("type") && s.schema->at("type").is_string() && s.schema->at("type") == "string") {
return true;
}
return false;
};
// Unwrap the parser so we can properly check if it's a sequence or choice
auto effective_parser = [&](common_peg_parser_id id) -> const common_peg_parser_variant & {
while (true) {
const auto & p = parsers_.at(id);
if (const auto * tag = std::get_if<common_peg_tag_parser>(&p)) {
id = tag->child;
} else if (const auto * atomic = std::get_if<common_peg_atomic_parser>(&p)) {
id = atomic->child;
} else if (const auto * schema = std::get_if<common_peg_schema_parser>(&p)) {
if (schema_delegates(*schema)) {
id = schema->child;
} else {
return p;
}
} else {
return p;
}
}
};
// Generate GBNF for a parser
std::function<std::string(common_peg_parser_id)> to_gbnf = [&](common_peg_parser_id id) -> std::string {
const auto & parser = parsers_.at(id);
@@ -1607,7 +1577,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
s += " ";
}
auto child_gbnf = to_gbnf(child);
const auto & child_parser = effective_parser(child);
const auto & child_parser = parsers_.at(child);
if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
s += "(" + child_gbnf + ")";
@@ -1623,7 +1593,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
s += " | ";
}
auto child_gbnf = to_gbnf(child);
const auto & child_parser = effective_parser(child);
const auto & child_parser = parsers_.at(child);
if (std::holds_alternative<common_peg_choice_parser>(child_parser)) {
s += "(" + child_gbnf + ")";
} else {
@@ -1633,7 +1603,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
return s;
} else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
auto child_gbnf = to_gbnf(p.child);
const auto & child_parser = effective_parser(p.child);
const auto & child_parser = parsers_.at(p.child);
if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
child_gbnf = "(" + child_gbnf + ")";
@@ -1693,10 +1663,15 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
}
return gbnf_excluding_pattern(p.delimiters);
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
if (schema_delegates(p)) {
return to_gbnf(p.child);
if (p.schema) {
if (p.raw && p.schema->contains("type") && p.schema->at("type").is_string() && p.schema->at("type") == "string") {
// TODO: Implement more comprehensive grammar generation for raw strings.
// For now, use the grammar emitted from the underlying parser.
return to_gbnf(p.child);
}
return builder.add_schema(p.name, *p.schema);
}
return builder.add_schema(p.name, *p.schema);
return to_gbnf(p.child);
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
return p.name;
} else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {

View File

@@ -115,11 +115,9 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
break;
}
case REASONING_BUDGET_FORCING:
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
// force_pos is advanced in apply(), not here.
// This ensures the first forced token isn't skipped when the sampler
// is initialized directly in FORCING state (e.g. COUNTING + budget=0)
break;
case REASONING_BUDGET_DONE:
break;
@@ -146,6 +144,14 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
cur_p->data[i].logit = -INFINITY;
}
}
// advance to next forced token (done here rather than in accept so that
// the first forced token isn't skipped when starting in FORCING state)
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
}
static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
@@ -157,15 +163,9 @@ static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
ctx->force_pos = 0;
}
// forward declaration for use in clone
static struct llama_sampler * common_reasoning_budget_init_state(
const struct llama_vocab * vocab, const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens, const std::vector<llama_token> & forced_tokens,
int32_t budget, common_reasoning_budget_state initial_state);
static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
return common_reasoning_budget_init_state(
return common_reasoning_budget_init(
ctx->vocab,
ctx->start_matcher.tokens,
ctx->end_matcher.tokens,
@@ -191,13 +191,13 @@ static struct llama_sampler_i common_reasoning_budget_i = {
/* .backend_set_input = */ nullptr,
};
static struct llama_sampler * common_reasoning_budget_init_state(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state) {
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state) {
// promote COUNTING with budget <= 0 to FORCING
if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
initial_state = REASONING_BUDGET_FORCING;
@@ -217,48 +217,3 @@ static struct llama_sampler * common_reasoning_budget_init_state(
}
);
}
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
const std::vector<llama_token> & prefill_tokens) {
// Determine initial state from prefill: COUNTING if the prefill begins with
// the start sequence but does not also contain the end sequence after it.
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
if (!prefill_tokens.empty() && !start_tokens.empty() &&
prefill_tokens.size() >= start_tokens.size() &&
std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
initial_state = REASONING_BUDGET_COUNTING;
// If the end sequence also follows the start in the prefill, reasoning
// was opened and immediately closed — stay IDLE.
if (!end_tokens.empty() &&
prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
initial_state = REASONING_BUDGET_IDLE;
}
}
}
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
}
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state) {
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
}
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) {
if (!smpl) {
return REASONING_BUDGET_IDLE;
}
return ((const common_reasoning_budget_ctx *)smpl->ctx)->state;
}

View File

@@ -24,26 +24,14 @@ enum common_reasoning_budget_state {
// DONE: passthrough forever
//
// Parameters:
// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
// start_tokens - token sequence that activates counting
// end_tokens - token sequence for natural deactivation
// forced_tokens - token sequence forced when budget expires
// budget - max tokens allowed in the reasoning block
// prefill_tokens - tokens already present in the prompt (generation prompt);
// used to determine the initial state: COUNTING if they begin
// with start_tokens (but don't also end with end_tokens),
// IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
// start_tokens - token sequence that activates counting
// end_tokens - token sequence for natural deactivation
// forced_tokens - token sequence forced when budget expires
// budget - max tokens allowed in the reasoning block
// initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
// note: COUNTING with budget <= 0 is promoted to FORCING
//
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
const std::vector<llama_token> & prefill_tokens = {});
// Variant that takes an explicit initial state (used by tests and clone).
// COUNTING with budget <= 0 is promoted to FORCING.
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
@@ -51,5 +39,3 @@ struct llama_sampler * common_reasoning_budget_init(
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state);
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);

View File

@@ -1,17 +1,13 @@
#include "sampling.h"
#include "common.h"
#include "ggml.h"
#include "log.h"
#include "reasoning-budget.h"
#include <algorithm>
#include <cctype>
#include <climits>
#include <cmath>
#include <cstring>
#include <unordered_map>
#include <vector>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
@@ -110,7 +106,6 @@ struct common_sampler {
common_params_sampling params;
struct llama_sampler * grmr;
struct llama_sampler * rbudget;
struct llama_sampler * chain;
ring_buffer<llama_token> prev;
@@ -190,15 +185,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr;
llama_sampler * rbudget = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams);
std::vector<llama_sampler *> samplers;
const std::string & grammar_str = common_grammar_value(params.grammar);
if (grammar_str.compare(0, 11, "%llguidance") == 0) {
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", grammar_str.c_str());
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
@@ -247,55 +240,26 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
trigger_patterns_c.push_back(regex.c_str());
}
if (!grammar_str.empty()) {
if (!params.grammar.empty()) {
if (params.grammar_lazy) {
grmr = llama_sampler_init_grammar_lazy_patterns(vocab, grammar_str.c_str(), "root",
grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size());
} else {
grmr = llama_sampler_init_grammar(vocab, grammar_str.c_str(), "root");
grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
}
}
}
// Feed generation prompt tokens to the grammar sampler so it advances past
// tokens the template already placed in the prompt.
// Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
std::vector<llama_token> prefill_tokens;
if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
GGML_ASSERT(vocab != nullptr);
prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
if (!prefill_tokens.empty()) {
std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
// Some tokenizers will add a space before the first special token, need to remove
prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
}
}
if (grmr && !params.grammar_lazy) {
try {
for (const auto & token : prefill_tokens) {
llama_sampler_accept(grmr, token);
LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
}
} catch (std::exception &e) {
LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
throw e;
}
}
}
// reasoning budget sampler
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
rbudget = common_reasoning_budget_init(
// reasoning budget sampler — added first so it can force tokens before other samplers
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
samplers.push_back(common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
prefill_tokens);
params.reasoning_budget_tokens,
params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
}
if (params.has_logit_bias()) {
@@ -383,16 +347,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
params.backend_sampling = false;
}
if (rbudget && params.backend_sampling) {
LOG_WRN("%s: backend sampling is not compatible with reasoning budget, disabling\n", __func__);
params.backend_sampling = false;
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ grmr,
/* .rbudget = */ rbudget,
/* .chain = */ chain,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
@@ -408,27 +365,11 @@ void common_sampler_free(struct common_sampler * gsmpl) {
}
llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->rbudget);
llama_sampler_free(gsmpl->chain);
delete gsmpl;
}
static bool grammar_should_apply(struct common_sampler * gsmpl) {
if (!gsmpl->grmr) {
return false;
}
if (!gsmpl->rbudget) {
return true;
}
if (gsmpl->params.grammar_lazy) {
// if grammar is lazy, only apply when reasoning budget is not active
const auto state = common_reasoning_budget_get_state(gsmpl->rbudget);
return state == REASONING_BUDGET_IDLE || state == REASONING_BUDGET_DONE;
}
return true;
}
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (!gsmpl) {
return;
@@ -436,11 +377,6 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
const auto tm = gsmpl->tm();
// grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
llama_sampler_accept(gsmpl->rbudget, token);
if (gsmpl->grmr && accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
@@ -462,7 +398,6 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .rbudget = */ llama_sampler_clone(gsmpl->rbudget),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
@@ -532,7 +467,6 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr;
auto & rbudget = gsmpl->rbudget;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
@@ -544,8 +478,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
if (id != LLAMA_TOKEN_NULL) {
LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
// TODO: simplify
gsmpl->cur.resize(1);
@@ -558,10 +491,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
gsmpl->set_logits(ctx, idx);
// apply reasoning budget first
llama_sampler_apply(rbudget, &cur_p);
if (grammar_first && grammar_should_apply(gsmpl)) {
if (grammar_first) {
llama_sampler_apply(grmr, &cur_p);
}
@@ -569,7 +499,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
id = cur_p.data[cur_p.selected].id;
if (grammar_first || !grammar_should_apply(gsmpl)) {
if (grammar_first) {
return id;
}
@@ -590,12 +520,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(rbudget, &cur_p);
if (grammar_should_apply(gsmpl)) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

View File

@@ -31,10 +31,10 @@ import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
try:
from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
SentencePieceTokenizer,
)
@@ -45,9 +45,9 @@ except ImportError:
_MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
_mistral_common_installed = False
TokenizerVersion: Any = None
Tekkenizer: Any = None
SentencePieceTokenizer: Any = None
TokenizerVersion = None
Tekkenizer = None
SentencePieceTokenizer = None
_mistral_import_error_msg = (
"Mistral format requires `mistral-common` to be installed. Please run "
"`pip install mistral-common[image,audio]` to install it."
@@ -145,7 +145,6 @@ class ModelBase:
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self._is_nvfp4 = False
self._is_mxfp4 = False
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@@ -221,7 +220,7 @@ class ModelBase:
if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
tensor_names_from_index.update(weight_map.keys())
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment]
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
part_names = sorted(part_dict.keys())
else:
weight_map = {}
@@ -486,7 +485,7 @@ class ModelBase:
elif quant_method == "modelopt":
# Mixed-precision ModelOpt models: NVFP4 tensors are handled by
# _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and
# are dequantized here. k/v scale tensors are unused.
# are dequantized here. input_scale tensors are unused.
for name in self.model_tensors.keys():
if name.endswith(".weight_scale"):
weight_name = name.removesuffix("_scale")
@@ -494,7 +493,7 @@ class ModelBase:
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
tensors_to_remove.append(name)
if name.endswith((".k_scale", ".v_scale")):
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
tensors_to_remove.append(name)
elif quant_method is not None:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -542,6 +541,7 @@ class ModelBase:
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
# Handle gate/up expert tensor fusion if enabled
@@ -606,12 +606,7 @@ class ModelBase:
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
if "language_model." in name:
name = name.replace("language_model.", "")
new_name = self.map_tensor_name(name)
def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
raw, shape = self._nvfp4_pack(weight, scale)
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
@@ -623,18 +618,10 @@ class ModelBase:
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale2_f32)
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(input_scale):
input_scale_f32 = input_scale.float().numpy().flatten()
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
def _generate_nvfp4_tensors(self):
# Per-layer expert merging to avoid holding all experts in memory
expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
expert_shapes: dict[tuple[int, str], list[int]] = {}
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
consumed: list[str] = []
@@ -644,7 +631,6 @@ class ModelBase:
continue
scale_name = name.replace(".weight", ".weight_scale")
scale2_name = name.replace(".weight", ".weight_scale_2")
input_scale_name = name.replace(".weight", ".input_scale")
if scale_name not in self.model_tensors:
continue
# Force eager materialization of lazy tensors
@@ -656,14 +642,11 @@ class ModelBase:
continue
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))())
# Mark tensors for removal from model_tensors (already written to gguf)
consumed.extend([name, scale_name])
if scale2_name in self.model_tensors:
consumed.append(scale2_name)
if input_scale_name in self.model_tensors:
consumed.append(input_scale_name)
# Check if this is a per-expert tensor
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
@@ -679,37 +662,34 @@ class ModelBase:
if key not in expert_blocks:
expert_blocks[key] = []
expert_scales[key] = []
expert_input_scales[key] = []
expert_shapes[key] = shape
expert_blocks[key].append((expert_id, raw.copy()))
# Collect per-expert scale2 (scalar per expert)
expert_scales[key].append((expert_id, float(scale2.float().sum())))
# Collect per-expert input_scale (scalar per expert)
expert_input_scales[key].append((expert_id, float(input_scale.float().sum())))
# Flush when all experts for this (layer, proj) are collected
if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
else:
self._repack_nvfp4(name, weight, scale, scale2, input_scale)
new_name = self.map_tensor_name(name)
self._repack_nvfp4(new_name, weight, scale, scale2)
# Flush any remaining experts (fallback if n_experts was unknown)
for (bid, proj_type) in list(expert_blocks.keys()):
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
# Remove consumed tensors so get_tensors/modify_tensors won't see them
for name in consumed:
self.model_tensors.pop(name, None)
# Remove any remaining unused auxiliary tensors
# Remove unused auxiliary tensors (input_scale, k_scale, v_scale)
for name in list(self.model_tensors.keys()):
if name.endswith((".k_scale", ".v_scale")):
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
del self.model_tensors[name]
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type):
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
experts = expert_blocks.pop(key)
scales = expert_scales.pop(key)
input_scales = expert_input_scales.pop(key)
shape = expert_shapes.pop(key)
experts.sort(key=lambda x: x[0])
@@ -727,20 +707,11 @@ class ModelBase:
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
input_scales.sort(key=lambda x: x[0])
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
del experts, merged
def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
quant_config_file = self.dir_model / "hf_quant_config.json"
@@ -757,7 +728,6 @@ class ModelBase:
quant_algo = "NVFP4"
self._is_nvfp4 = quant_algo == "NVFP4"
self._is_mxfp4 = quant_method == "mxfp4"
# NVFP4 weights are repacked and written directly to gguf_writer.
# This must run before dequant_model so NVFP4 tensors are removed
@@ -906,12 +876,6 @@ class ModelBase:
if self.metadata.name is None:
self.metadata.name = self.dir_model.name
if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16):
if self._is_nvfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4
elif self._is_mxfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
# Generate parameter weight class (useful for leader boards) if not yet determined
if self.metadata.size_label is None and total_params > 0:
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
@@ -974,9 +938,6 @@ class ModelBase:
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
if "language_config" in config:
# rename for DeepSeekOCR
config["text_config"] = config["language_config"]
if "lfm" in config:
# rename for LFM2-Audio
config["text_config"] = config["lfm"]
@@ -1101,10 +1062,6 @@ class TextModel(ModelBase):
self.gguf_writer.add_head_count_kv(n_head_kv)
logger.info(f"gguf: key-value head count = {n_head_kv}")
if self.hparams.get("is_causal") is False:
self.gguf_writer.add_causal_attention(False)
logger.info("gguf: causal attention = False")
# TODO: Handle "sliding_attention" similarly when models start implementing it
rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
if (rope_type := rope_params.get("rope_type")) is not None:
@@ -1164,7 +1121,7 @@ class TextModel(ModelBase):
if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
self.gguf_writer.add_expert_count(n_experts)
logger.info(f"gguf: expert count = {n_experts}")
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None:
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
logger.info(f"gguf: experts used count = {n_experts_used}")
if (n_expert_groups := self.hparams.get("n_group")) is not None:
@@ -1338,9 +1295,6 @@ class TextModel(ModelBase):
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
# ref: https://huggingface.co/aari1995/German_Semantic_V3
res = "jina-v2-de"
if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
# ref: https://huggingface.co/evilfreelancer/ruGPT3XL
res = "gpt-2"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -1536,9 +1490,6 @@ class TextModel(ModelBase):
if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
# ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
res = "kanana2"
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
res = "f2llmv2"
if res is None:
logger.warning("\n")
@@ -2107,7 +2058,7 @@ class MmprojModel(ModelBase):
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]
has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
@@ -4309,16 +4260,6 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):
min_dynamic_tiles: int = 0
max_dynamic_tiles: int = 0
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0)
self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0)
def set_gguf_parameters(self):
assert self.hparams_vision is not None
if isinstance(self.hparams_vision['image_size'], list):
@@ -4341,11 +4282,6 @@ class InternVisionModel(MmprojModel):
downsample_ratio = self.global_config.get("downsample_ratio")
assert downsample_ratio is not None
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
# older models may not have min/max_dynamic_patch in config
if self.min_dynamic_tiles > 0:
self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles)
if self.max_dynamic_tiles > 0:
self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles)
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name:
@@ -4608,7 +4544,7 @@ class Qwen2MoeModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model")
@ModelBase.register("Qwen3ForCausalLM")
class Qwen3Model(Qwen2Model):
model_arch = gguf.MODEL_ARCH.QWEN3
@@ -5041,97 +4977,6 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim]
return tensor.permute(*perm).contiguous().reshape(*shape)
def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]:
if not name.endswith((
".linear_attn.in_proj_qkv.weight",
".linear_attn.in_proj_z.weight",
".linear_attn.in_proj_a.weight",
".linear_attn.in_proj_b.weight",
".linear_attn.out_proj.weight",
)):
return weight, scale
num_k_heads = self.hparams["linear_num_key_heads"]
num_v_heads = self.hparams["linear_num_value_heads"]
head_k_dim = self.hparams["linear_key_head_dim"]
head_v_dim = self.hparams["linear_value_head_dim"]
num_v_per_k = num_v_heads // num_k_heads
def unpack_nibbles(qs: Tensor) -> Tensor:
lo = torch.bitwise_and(qs, 0x0F)
hi = torch.bitwise_right_shift(qs, 4)
return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2)
def pack_nibbles(codes: Tensor) -> Tensor:
codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2)
lo = torch.bitwise_and(codes[..., 0], 0x0F)
hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4)
return torch.bitwise_or(lo, hi).contiguous()
def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]:
assert qs.ndim >= 2
assert scales.ndim >= 2
k = qs.shape[-1] * 2
assert col_perm.numel() == k
assert k % 16 == 0
group_cols = col_perm.reshape(-1, 16)
group_starts = group_cols[:, 0]
expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype)
assert torch.equal(group_cols, expected)
assert torch.all(group_starts % 16 == 0)
group_perm = (group_starts // 16).to(dtype=torch.long)
expected_groups = torch.arange(scales.shape[-1], dtype=torch.long)
assert group_perm.numel() == scales.shape[-1]
assert torch.equal(torch.sort(group_perm).values, expected_groups)
codes = unpack_nibbles(qs)
codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long))
qs = pack_nibbles(codes)
scales = scales.index_select(-1, group_perm.to(device=scales.device))
return qs, scales
def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]:
row_perm = self._reorder_v_heads(
torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1),
0, num_k_heads, num_v_per_k, head_dim,
).squeeze(-1)
return (
qs.index_select(0, row_perm.to(device=qs.device)),
scales.index_select(0, row_perm.to(device=scales.device)),
)
if name.endswith(".linear_attn.in_proj_qkv.weight"):
q_dim = head_k_dim * num_k_heads
k_dim = head_k_dim * num_k_heads
q = weight[:q_dim]
k = weight[q_dim:q_dim + k_dim]
v = weight[q_dim + k_dim:]
q_scale = scale[:q_dim]
k_scale = scale[q_dim:q_dim + k_dim]
v_scale = scale[q_dim + k_dim:]
v, v_scale = reorder_rows(v, v_scale, head_v_dim)
return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0)
if name.endswith(".linear_attn.in_proj_z.weight"):
weight, scale = reorder_rows(weight, scale, head_v_dim)
elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")):
weight, scale = reorder_rows(weight, scale, 1)
elif name.endswith(".linear_attn.out_proj.weight"):
col_perm = self._reorder_v_heads(
torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0),
1, num_k_heads, num_v_per_k, head_v_dim,
).squeeze(0)
weight, scale = apply_col_perm(weight, scale, col_perm)
return weight, scale
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
weight, scale = self._transform_nvfp4_weight(name, weight, scale)
super()._repack_nvfp4(name, weight, scale, scale2, input_scale)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_k_heads = self.hparams.get("linear_num_key_heads", 0)
num_v_heads = self.hparams.get("linear_num_value_heads", 0)
@@ -5221,47 +5066,6 @@ class GPT2Model(TextModel):
yield from super().modify_tensors(data_torch, new_name, bid)
@ModelBase.register("RuGPT3XLForCausalLM")
class RuGPT3XLModel(TextModel):
model_arch = gguf.MODEL_ARCH.GPT2
_qkv_parts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Fuse separate Q, K, V projections into a single QKV tensor
if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name:
suffix = "weight" if name.endswith(".weight") else "bias"
part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v")
key = f"{part}.{suffix}"
assert bid is not None
if self._qkv_parts is None:
self._qkv_parts = [{} for _ in range(self.block_count)]
self._qkv_parts[bid][key] = data_torch
q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}"
if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]):
q = self._qkv_parts[bid].pop(q_key)
k = self._qkv_parts[bid].pop(k_key)
v = self._qkv_parts[bid].pop(v_key)
data_torch = torch.cat([q, k, v], dim=0)
name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}")
logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}")
else:
return
yield from super().modify_tensors(data_torch, name, bid)
def prepare_tensors(self):
super().prepare_tensors()
if self._qkv_parts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()]
if len(parts) > 0:
raise ValueError(f"Unprocessed Q/K/V parts: {parts}")
@ModelBase.register("PhiForCausalLM")
class Phi2Model(TextModel):
model_arch = gguf.MODEL_ARCH.PHI2
@@ -6074,7 +5878,7 @@ class InternLM2Model(TextModel):
logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1)
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
@@ -6395,7 +6199,7 @@ class BertModel(TextModel):
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
else:
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
@@ -6878,9 +6682,7 @@ class Gemma2Model(TextModel):
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
class Gemma3Model(TextModel):
model_arch = gguf.MODEL_ARCH.GEMMA3
def norm_shift(self, name: str) -> float:
return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
def set_vocab(self):
if (self.dir_model / "tokenizer.model").is_file():
@@ -6918,22 +6720,17 @@ class Gemma3Model(TextModel):
# remove OOV (out-of-vocabulary) rows in token_embd
if "embed_tokens.weight" in name:
n_vocab_real = -1
if (self.dir_model / "tokenizer.model").is_file():
tokens = self._create_vocab_sentencepiece()[0]
n_vocab_real = len(tokens)
else:
with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"])
data_torch = data_torch[:n_vocab_real]
tokens = self.get_vocab_base()[0]
data_torch = data_torch[:len(tokens)]
# ref code in Gemma3RMSNorm
# output = output * (1.0 + self.weight.float())
# note: this is not the case on gemma3n
f_shift = self.norm_shift(name)
if f_shift != 0.0:
data_torch = data_torch + f_shift
if name.endswith("norm.weight"):
data_torch = data_torch + self.norm_shift
yield from super().modify_tensors(data_torch, name, bid)
@@ -7107,71 +6904,6 @@ class ConformerAudioModel(MmprojModel):
assert data_torch.shape[2] == 1
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
yield (mapped_name, data_torch)
@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
# default values below are taken from HF transformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
# calculate proj_scale_factor (used by tinygemma3 test model)
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
n_per_side = int(image_seq_length ** 0.5)
image_size = self.hparams["image_size"]
patch_size = self.hparams["patch_size"]
proj_scale_factor = (image_size // patch_size) // n_per_side
if proj_scale_factor > 0 and proj_scale_factor != 4:
# we only need to write this if it's not the default value
# in this case, we are converting a test model
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
# @bluebread: there's no window_size in config but just add it here anyway
self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14))
# SAM configuration
sam_hparams = hparams['sam']
self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])
self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads'])
def get_vision_config(self) -> dict[str, Any]:
vision_config: dict[str, Any] | None = self.global_config.get("vision_config")
if not vision_config:
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
vision_config['sam'] = vision_config['width']['sam_vit_b']
vision_config.update(vision_config['width']['clip-l-14-224'])
vision_config['hidden_size'] = vision_config['width']
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4
return vision_config
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".embeddings." in name or 'pos_embed' in name:
return gguf.GGMLQuantizationType.F32
if ".rel_pos_h" in name or '.rel_pos_w' in name:
return gguf.GGMLQuantizationType.F32
if ".neck." in name or ".net_" in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Only process vision-related tensors, skip language model tensors
# Vision components: sam_model, vision_model, projector, image_newline, view_seperator
# Language model components to skip: lm_head, embed_tokens, layers, norm
if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
return
if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"):
name += ".weight"
yield from super().modify_tensors(data_torch, name, bid)
@@ -7297,6 +7029,7 @@ class Gemma3nVisionAudioModel(ConformerAudioModel):
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
class Gemma3NModel(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA3N
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
_altup_proj: list[Tensor] = []
_altup_unembd: list[Tensor] = []
@@ -7315,10 +7048,6 @@ class Gemma3NModel(Gemma3Model):
torch.Tensor(), # to be replaced
]
def norm_shift(self, name: str) -> float:
del name
return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
def set_vocab(self):
# For Gemma3n multimodal models, we need the FULL vocab_size (262400)
# which includes special tokens from 262144-262399 for vision/audio.
@@ -7436,212 +7165,6 @@ class Gemma3NModel(Gemma3Model):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4Model(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA4
def norm_shift(self, name: str) -> float:
del name # unused
return 0.0
def set_vocab(self):
vocab = gguf.LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
visible_tokens = {"<|channel>", "<channel|>", "<|tool_call>", "<tool_call|>", "<|tool_response>", "<tool_response|>", "<|\"|>"}
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
text_str = text.decode()
if text_str in visible_tokens:
# always render these tokens, so that the chat parser can read them
toktypes.append(gguf.TokenType.USER_DEFINED)
logger.info(f"Token '{text_str}' is set to USER_DEFINED")
else:
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size
# TODO @ngxson : there are some known (rare) issues with the tokenizer during development
# but I don't have time to dive into them right now;
# using a dedicated tokenizer name so that we can fix later without re-converting GGUF
self.gguf_writer.add_tokenizer_model("gemma4")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
self.gguf_writer.add_add_bos_token(False) # already added via the chat template
def set_gguf_parameters(self):
super().set_gguf_parameters()
num_kv_shared_layers = self.hparams["num_kv_shared_layers"]
self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers)
# per-layer embedding is optional
n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0
self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd)
swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]]
self.gguf_writer.add_sliding_window_pattern(swa_layers)
head_dim_full = self.hparams["global_head_dim"]
head_dim_swa = self.hparams["head_dim"]
# correct the head dim for global/swa layers
self.gguf_writer.add_key_length(head_dim_full)
self.gguf_writer.add_value_length(head_dim_full)
self.gguf_writer.add_key_length_swa(head_dim_swa)
self.gguf_writer.add_value_length_swa(head_dim_swa)
expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"])
if expert_intermediate_size is not None:
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
# if use_double_wide_mlp is set, we need to adjust the value for kv shared layers
use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False)
first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers
if use_double_wide_mlp:
n_ff = self.hparams["intermediate_size"]
n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)]
self.gguf_writer.add_feed_forward_length(n_ff_arr)
# handle num_global_key_value_heads
num_key_value_heads_full = self.hparams.get("num_global_key_value_heads")
num_key_value_heads_swa = self.hparams.get("num_key_value_heads")
if num_key_value_heads_full is not None and num_key_value_heads_swa is not None:
value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers]
self.gguf_writer.add_head_count_kv(value_arr)
# handle n_rot differently for global vs swa layers
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
self.gguf_writer.add_rope_dimension_count(n_rot_full)
self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa)
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# full layer uses "proportional" rope with partial_rotary_factor=0.25
# the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated),
# but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention
# solution is to set specific freq_factors for the unrotated dims
# IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers
rope_params_full = self.hparams["rope_parameters"]["full_attention"]
assert rope_params_full["rope_type"] == "proportional"
head_dim_full = (self.hparams["global_head_dim"])
partial_rotary_factor_full = rope_params_full["partial_rotary_factor"]
n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2)
n_unrot_full = int(head_dim_full / 2) - n_rot_full
values = [1.0] * n_rot_full + [1e30] * n_unrot_full
rope_freqs_full = torch.tensor(values, dtype=torch.float32)
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
name = name + ".weight"
if "language_model." not in name and "rope_freqs" not in name:
return # skip non-language model tensors
name = name.replace("language_model.", "")
if name.endswith("router.scale"):
name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale")
yield (name, data_torch)
return
if ".per_expert_scale" in name:
# convert per-expert scale to FFN down scale
name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale")
yield (name, data_torch)
return
if ".experts." in name and not name.endswith(".weight"):
name += ".weight"
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4VisionAudioModel(MmprojModel):
has_audio_encoder = True
has_vision_encoder = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.hparams_vision["image_size"] = 224 # unused, but set to avoid error
# remap audio hparams
if self.hparams_audio:
self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128)
self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
else:
self.has_audio_encoder = False
def set_gguf_parameters(self):
super().set_gguf_parameters()
# vision params
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
# audio params
if self.hparams_audio:
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
def is_audio_tensor(self, name: str) -> bool:
return "audio_tower" in name or "embed_audio" in name
def tensor_force_quant(self, name, new_name, bid, n_dims):
if self.is_audio_tensor(name):
if ".conv" in name or "_conv" in name and ".weight" in name:
return gguf.GGMLQuantizationType.F32
if "position_embedding_table" in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("model.language_model."):
return # skip
if len(data_torch.shape) == 0:
# convert scalar tensors (input/output_mix/max) to 1D tensors
data_torch = data_torch.unsqueeze(0)
if self.is_audio_tensor(name):
assert self.hparams_audio is not None
name = name.replace("model.audio_tower.", "conformer.")
name = name.replace(".linear.", ".")
if name.endswith("per_dim_key_scale") or name.endswith("per_dim_scale"):
name = name + ".weight"
data_torch = torch.nn.functional.softplus(data_torch)
if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"):
assert data_torch.shape[1] == 1
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
yield (mapped_name, data_torch)
else:
name = name.replace("model.vision_tower.encoder.", "vision_model.model.")
name = name.replace(".linear.weight", ".weight")
if name.endswith("layer_scalar") or name.endswith("position_embedding_table"):
name = name + ".weight"
if name.endswith("patch_embedder.input_proj.weight"):
n_embd, ksize_sq_c = data_torch.shape
patch_size = int((ksize_sq_c // 3) ** 0.5)
data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3)
data_torch = data_torch.permute(0, 3, 1, 2).contiguous()
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
yield (mapped_name, data_torch)
@ModelBase.register("Starcoder2ForCausalLM")
class StarCoder2Model(TextModel):
model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -8729,19 +8252,6 @@ class DeepseekV2Model(TextModel):
merge_expert = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# special handling for Deepseek OCR
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
# default jinja template
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
def set_vocab(self):
try:
self._set_vocab_gpt2()
@@ -8797,15 +8307,9 @@ class DeepseekV2Model(TextModel):
raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
def set_gguf_parameters(self):
is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR)
if is_ocr:
self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0)
else:
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1
self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6)
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1
super().set_gguf_parameters()
hparams = self.hparams
@@ -8819,18 +8323,16 @@ class DeepseekV2Model(TextModel):
# Default: if no MoE, all layers are dense; if MoE, none are dense
first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
kv_lora_rank = hparams.get("kv_lora_rank", 512)
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
# note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
if not is_ocr:
self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(kv_lora_rank)
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
# MoE parameters (required by C++ code for DEEPSEEK2 arch)
# For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
@@ -8862,15 +8364,8 @@ class DeepseekV2Model(TextModel):
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5, and DeepSeek-OCR
if ("vision_tower" in name
or "multi_modal_projector" in name
or "mm_projector" in name
or "vision_model" in name
or "image_newline" in name
or "model.projector" in name
or "sam_model" in name
or "view_seperator" in name):
# skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
return
if name.startswith("siglip2.") or name.startswith("merger."):
return
@@ -9381,7 +8876,7 @@ class T5Model(TextModel):
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
@@ -9518,7 +9013,7 @@ class T5EncoderModel(TextModel):
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
@@ -11626,7 +11121,8 @@ class GptOssModel(TextModel):
# TODO: remove once MXFP4 is supported more generally
def dequant_model(self):
if self._is_mxfp4:
quant_config = self.hparams.get("quantization_config")
if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
return
return super().dequant_model()
@@ -12779,7 +12275,6 @@ class LazyTorchTensor(gguf.LazyBase):
kwargs = {}
if func is torch.Tensor.numpy:
assert len(args)
return args[0].numpy()
return cls._wrap_fn(func)(*args, **kwargs)

View File

@@ -154,7 +154,6 @@ models = [
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -178,7 +177,6 @@ pre_computed_hashes = [
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
# jina-v2-de variants
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"},
]

View File

@@ -112,11 +112,11 @@ class Tensor:
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
self.dtype = gguf.GGMLQuantizationType(dtype)
quant = gguf.GGML_QUANT_SIZES.get(self.dtype)
quant = gguf.GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])

View File

@@ -199,13 +199,10 @@ class LoraTorchTensor:
kwargs = {}
if func is torch.permute:
assert len(args)
return type(args[0]).permute(*args, **kwargs)
elif func is torch.reshape:
assert len(args)
return type(args[0]).reshape(*args, **kwargs)
elif func is torch.stack:
assert len(args)
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
@@ -214,7 +211,6 @@ class LoraTorchTensor:
torch.stack([b._lora_B for b in args[0]], dim),
)
elif func is torch.cat:
assert len(args)
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
@@ -366,7 +362,7 @@ if __name__ == '__main__':
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
class LoraModel(model_class): # ty: ignore[unsupported-base]
class LoraModel(model_class):
model_arch = model_class.model_arch
lora_alpha: float

View File

@@ -14,7 +14,7 @@ The unified auto-parser uses a pure differential, compositional approach (inspir
**Analysis + Parser Building in Two Steps**:
1. `autoparser::autoparser tmpl_analysis(tmpl)` — runs all differential comparisons and populates the analysis structs
2. `autoparser::peg_generator::generate_parser(tmpl, generation_params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar
2. `autoparser::peg_generator::generate_parser(tmpl, params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar
## Data Structures
@@ -34,7 +34,7 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
### `analyze_tools` and its sub-structs
- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`)
- [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`, `uses_python_dicts`)
- [common/chat-auto-parser.h:196-200](common/chat-auto-parser.h#L196-L200) — `tool_function_analysis`: `name_prefix`, `name_suffix`, `close` markers around function names
- [common/chat-auto-parser.h:202-210](common/chat-auto-parser.h#L202-L210) — `tool_arguments_analysis`: `start/end` container markers, `name_prefix/suffix`, `value_prefix/suffix`, `separator`
- [common/chat-auto-parser.h:212-217](common/chat-auto-parser.h#L212-L217) — `tool_id_analysis`: `pos` enum, `prefix`/`suffix` markers around call ID values
@@ -47,21 +47,12 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
| Value | Description |
|-----------------|-----------------------------------------------------------------------------------|
| `NONE` | No reasoning markers detected |
| `TAG_BASED` | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats) |
| `TAG_BASED` | Standard tag-based: `<think>...</think>` |
| `DELIMITER` | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`) |
| `FORCED_OPEN` | Template ends with open reasoning tag when `enable_thinking=true` |
| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start |
| `TOOLS_ONLY` | Reasoning only appears in tool call responses, not plain content |
**Generation Prompt & Reasoning Prefill**: Computed in `common_chat_templates_apply_jinja` before invoking either the specialized handlers or the auto-parser, by rendering the template twice — once with `add_generation_prompt=false` and once with `add_generation_prompt=true` — and storing the diff suffix as `generation_params::generation_prompt`. This string is propagated into `common_chat_params::generation_prompt` and `common_chat_parser_params::generation_prompt`.
The generation prompt is prepended to model output before PEG parsing via `wrap_for_generation_prompt()`. The portion *before* the reasoning start marker (if any) is prepended as a literal to ensure any boilerplate added by the template is consumed. The full string is also fed to the grammar sampler via `llama_sampler_accept` (stored in `common_params_sampling::grammar_prefill`), advancing the grammar past tokens already in the prompt. It is used to determine the reasoning budget sampler's initial state — COUNTING if the prefill tokens begin with the reasoning start sequence (but don't also contain the end sequence), IDLE otherwise.
**`grammar_prefill`** (`common_params_sampling`): The generation prompt string tokenized and accepted by the grammar sampler at init time. Only applied when `grammar_external` is false (i.e., the grammar was not set explicitly by the user).
Three outcomes for reasoning-prefill handling (in `generate_parser()`):
1. **Start+end in generation prompt** (e.g. `<think></think>\n`): the parser sees reasoning as opened and immediately closed; whitespace-only reasoning content is discarded.
2. **Only start in generation prompt** (e.g. `<think>\n`): the parser sees reasoning as already open.
3. **Start marker present but not at the end** (e.g. Apriel's `<|begin_assistant|>` followed by boilerplate): the marker is a template artifact; the start literal is cleared so reasoning uses delimiter-style (end-only). For templates that ignore `add_generation_prompt` (empty diff), the rendered `data.prompt` is used as fallback — but only for non-TOOLS_ONLY modes, since in TOOLS_ONLY the start tag is model-generated and may appear in prior conversation turns.
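A minimal Python sketch of this double-render diff and of the three prefill outcomes above, with made-up tag strings and helper names (the actual logic lives in the C++ `common_chat_templates_apply_jinja` and `generate_parser()`):

```python
# Illustrative only: derive the generation prompt as the suffix added when
# add_generation_prompt flips from false to true, then classify how the
# reasoning prefill should be handled. Names and tags here are hypothetical.

def generation_prompt(render_without: str, render_with: str) -> str:
    # The generation prompt is exactly what the template appends.
    assert render_with.startswith(render_without)
    return render_with[len(render_without):]

def classify_prefill(gen_prompt: str, start: str = "<think>", end: str = "</think>") -> str:
    if start in gen_prompt and end in gen_prompt:
        return "opened-and-closed"   # outcome 1: whitespace-only reasoning discarded
    if gen_prompt.rstrip().endswith(start):
        return "already-open"        # outcome 2: parser starts inside reasoning
    if start in gen_prompt:
        return "template-artifact"   # outcome 3: start literal cleared, end-only parsing
    return "none"

without = "<|user|>hi<|end|>"
with_gp = "<|user|>hi<|end|><|assistant|><think>\n"
gp = generation_prompt(without, with_gp)
print(repr(gp))              # '<|assistant|><think>\n'
print(classify_prefill(gp))  # 'already-open'
```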
**`content_mode`**: How the template wraps assistant content.
| Value | Description |
@@ -270,16 +261,16 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark
- Searches `diff.right` (output with reasoning) for the reasoning content needle
- Uses PEG parsers to find surrounding markers:
- If both pre/post markers found in `diff.right``TAG_BASED`
- If both found but post marker only in the full output B → `TAG_BASED` (template forces markers; handled via prefill)
- If only post marker found → `TAG_BASED` (delimiter-style, empty start)
- If both pre/post markers found in `diff.right``TAG_BASED` (both tags visible in diff = no forced close)
- If both found but post marker only in the full output B → `FORCED_CLOSED`
- If only post marker found → `DELIMITER`
- Sets `reasoning.start` and `reasoning.end`
**R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.
- Detects template-added reasoning markers: `enable_thinking=true` appends a non-empty marker sets `reasoning.start`, mode = `TAG_BASED`
- Handles the reverse case (`enable_thinking=false` appends the marker instead): extracts both start (from the preceding segment) and end markers; mode = `TAG_BASED`
- The reasoning prefill (markers added by the template) is later extracted in `common_chat_templates_apply_jinja` and prepended to model output before parsing
- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
- Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
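Conceptually, R1 boils down to locating a known reasoning "needle" in the rendered output and taking the text immediately around it as candidate start/end markers. A simplified Python sketch of that idea (the real implementation uses the segmentizer and PEG parsers; the needle and tags below are invented for illustration):

```python
import re

# Toy version of the R1 marker search: find the needle, then take the nearest
# <...> or [...] style token on each side as the candidate start/end marker.
# If only the end marker is found, the format is delimiter-style (empty start).

def find_reasoning_markers(rendered: str, needle: str) -> tuple[str, str]:
    pos = rendered.find(needle)
    if pos < 0:
        return "", ""
    before = rendered[:pos]
    after = rendered[pos + len(needle):]
    m_start = re.search(r"(<[^<>]+>|\[[^\[\]]+\])\s*$", before)
    m_end = re.match(r"\s*(<[^<>]+>|\[[^\[\]]+\])", after)
    return (m_start.group(1) if m_start else "",
            m_end.group(1) if m_end else "")

rendered = "<|assistant|><think>NEEDLE</think>final answer"
print(find_reasoning_markers(rendered, "NEEDLE"))  # ('<think>', '</think>')
```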
**R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.
@@ -352,7 +343,7 @@ Classification logic:
A workaround array in `common/chat-diff-analyzer.cpp` applies post-hoc patches after analysis. Each workaround is a lambda that inspects the template source and overrides analysis results. Current workarounds:
1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')` but not `<SPECIAL_12>`: sets `reasoning.mode = TAG_BASED` with `<think>`/`</think>` markers if no reasoning was detected
1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')`: sets `reasoning.mode = FORCED_OPEN` with `<think>`/`</think>` markers if no reasoning was detected
2. **Granite 3.3** — source contains specific "Write your thoughts" text: forces `TAG_BASED` reasoning with `<think>`/`</think>` and `WRAPPED_WITH_REASONING` content with `<response>`/`</response>`
3. **Cohere Command R+** — source contains `<|CHATBOT_TOKEN|>`: sets `ALWAYS_WRAPPED` content mode if no content start is already set
4. **Functionary 3.1** — source contains `set has_code_interpreter`: forces `PLAIN` content, specific `per_call_start/end`, clears preserved tokens to only keep Functionary-specific markers
@@ -364,13 +355,12 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i
#### Reasoning Parser (`analyze_reasoning::build_parser`)
| Mode | Parser |
|-----------------------------------------------|---------------------------------------------------------------------------|
| Not extracting reasoning | `eps()` |
| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end + space())` |
| `TAG_BASED` or `TOOLS_ONLY` (empty start) | `optional(reasoning(until(end)) + end + space())` — delimiter-style |
Note: The start marker may be empty either because the analyzer detected delimiter-style reasoning, or because `generate_parser()` cleared a template artifact start marker (see Generation Prompt & Reasoning Prefill above). Whitespace-only reasoning content (e.g. from a `<think></think>` prefill) is discarded by the mapper.
| Mode | Parser |
|-----------------------------------|---------------------------------------------------------------------|
| Not extracting reasoning | `eps()` |
| `FORCED_OPEN` or `FORCED_CLOSED` | `reasoning(until(end)) + end` — opening tag was in the prompt |
| `TAG_BASED` or `TOOLS_ONLY` | `optional(start + reasoning(until(end)) + end)` |
| `DELIMITER` | `optional(reasoning(until(end)) + end)` — no start marker |
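The reasoning-parser rules above amount to the following extraction behavior, shown here as a plain-string Python sketch rather than the actual PEG combinators:

```python
# Illustrative reasoning extraction: an optional start marker, reasoning text
# up to the end marker, then the end marker. An empty start string models the
# delimiter-style case. Not the real combinator-based parser.

def split_reasoning(output: str, start: str, end: str) -> tuple[str, str]:
    """Return (reasoning, remaining_content)."""
    text = output
    if start:
        if not text.startswith(start):
            return "", output          # optional(...): no reasoning present
        text = text[len(start):]
    end_pos = text.find(end)
    if end_pos < 0:
        return "", output              # unterminated: treat as plain content
    reasoning = text[:end_pos]
    content = text[end_pos + len(end):].lstrip()
    if reasoning.strip() == "":
        reasoning = ""                 # whitespace-only reasoning is discarded
    return reasoning, content

print(split_reasoning("<think>plan steps</think>\nHello!", "<think>", "</think>"))
print(split_reasoning("just an answer", "<think>", "</think>"))
print(split_reasoning("plan steps</think>\nHello!", "", "</think>"))  # delimiter-style
```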
#### Content Parser (`analyze_content::build_parser`)
@@ -420,7 +410,9 @@ All three tool parsers return:
reasoning + optional(content(until(trigger_marker))) + tool_calls + end()
```
Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepends a literal for any boilerplate prefix of the generation prompt (the portion before the reasoning start marker).
### Python Dict Format
When `format.uses_python_dicts` is true (detected when single-quoted strings appear in JSON argument context), `build_parser()` pre-registers a `json-string` rule that accepts both single-quoted and double-quoted strings. This is done before any `p.json()` call so all JSON parsing inherits the flexible rule.
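As a rough picture of what the flexible `json-string` rule has to accept, here is a small Python reader for a quoted string that tolerates either quote character (the real rule is a PEG/GBNF production, not Python):

```python
# Read one quoted string starting at position i, accepting either ' or " as
# the delimiter and skipping over backslash escapes. Returns (value, next_index).

def read_flexible_string(text: str, i: int) -> tuple[str, int]:
    quote = text[i]
    assert quote in ("'", '"'), "expected a string to start here"
    i += 1
    out = []
    while i < len(text):
        ch = text[i]
        if ch == "\\" and i + 1 < len(text):
            out.append(text[i:i + 2])
            i += 2
            continue
        if ch == quote:
            return "".join(out), i + 1
        out.append(ch)
        i += 1
    raise ValueError("unterminated string")

print(read_flexible_string("'key': 'val'", 0))   # ('key', 5)
print(read_flexible_string('"key": "val"', 0))   # ('key', 5)
```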
## Mapper
@@ -429,22 +421,22 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend
- **Buffered arguments**: Before `tool_name` is known, argument text goes to `args_buffer`; once the name is set, the buffer is flushed to `current_tool->arguments`
- **`args_target()`**: Returns a reference to whichever destination is currently active (buffer or tool args), eliminating branching
- **`closing_quote_pending`**: Tracks whether a closing `"` needs to be appended when a string argument value is finalized (for schema-declared string types in tagged format)
- **Whitespace-only reasoning**: Reasoning content that consists entirely of whitespace (e.g. from a `<think></think>` prefill) is cleared so the message shows no reasoning
- **Quote normalization**: Python-style quotes (`'key': 'value'`) are converted to JSON (`"key": "value"`)
- **Brace auto-closing**: At tool close, unclosed `{` braces are closed automatically
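A compressed Python sketch of the buffering and clean-up behaviors listed above (a hypothetical structure, not the C++ `common_chat_peg_mapper`):

```python
# Toy mapper state illustrating three of the behaviors above: buffering
# argument text until the tool name is known, naive Python-to-JSON quote
# normalization, and auto-closing unbalanced braces when the tool call ends.

class ToyToolCallMapper:
    def __init__(self) -> None:
        self.tool_name: str | None = None
        self.args_buffer = ""          # destination until the name is known
        self.arguments = ""

    def on_args_text(self, text: str) -> None:
        if self.tool_name is None:
            self.args_buffer += text
        else:
            self.arguments += text

    def on_tool_name(self, name: str) -> None:
        self.tool_name = name
        self.arguments += self.args_buffer   # flush the buffer
        self.args_buffer = ""

    def on_tool_close(self) -> dict:
        args = self.arguments.replace("'", '"')            # naive quote normalization
        args += "}" * (args.count("{") - args.count("}"))  # brace auto-closing
        return {"name": self.tool_name, "arguments": args}

m = ToyToolCallMapper()
m.on_args_text("{'city': 'Paris'")   # arguments can arrive before the name
m.on_tool_name("get_weather")
print(m.on_tool_close())
# {'name': 'get_weather', 'arguments': '{"city": "Paris"}'}
```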
## Files
| File | Purpose |
|-------------------------------------------|---------------------------------------------------------------------------------|
| `common/chat-auto-parser.h` | All analysis structs, enums, `autoparser`, `peg_generator`, `generation_params` |
| `common/chat-auto-parser-generator.cpp` | Parser generator: `generate_parser()` and `build_parser()` methods |
| `common/chat-diff-analyzer.cpp` | Differential analysis implementation and workarounds |
| `common/chat-auto-parser-helpers.h/cpp` | `calculate_diff_split()`, `segmentize_markers()`, `compare_variants()`, |
| | `wrap_for_generation_prompt()`, string helpers |
| `common/chat-peg-parser.h/cpp` | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers |
| `common/chat.cpp` | Entry point: `common_chat_templates_apply_jinja()` |
| `tools/parser/debug-template-parser.cpp` | Debug tool for template analysis |
| `tools/parser/template-analysis.cpp` | Template analysis tool |
| File | Purpose |
|-------------------------------------------|----------------------------------------------------------------------|
| `common/chat-auto-parser.h` | All analysis structs, enums, `autoparser`, `peg_generator`, `templates_params` |
| `common/chat-auto-parser-generator.cpp` | Parser generator: `generate_parser()` and `build_parser()` methods |
| `common/chat-diff-analyzer.cpp` | Differential analysis implementation and workarounds |
| `common/chat-auto-parser-helpers.h/cpp` | `calculate_diff_split()`, `segmentize_markers()`, |
| | `compare_variants()`, string helpers |
| `common/chat-peg-parser.h/cpp` | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers |
| `common/chat.cpp` | Entry point: `common_chat_templates_apply_jinja()` |
| `tools/parser/debug-template-parser.cpp` | Debug tool for template analysis |
| `tools/parser/template-analysis.cpp` | Template analysis tool |
## Testing & Debugging
@@ -524,10 +516,10 @@ To support a new template format:
## Edge Cases and Quirks
1. **Generation Prompt & Reasoning Prefill**: The generation prompt is extracted by diffing `add_generation_prompt=false` vs `true` in `common_chat_templates_apply_jinja`, so it contains exactly what the template appends — avoiding false positives from prior conversation turns.
1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
3. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
4. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
5. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
6. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
7. **Undetected Tool Format**: If `analyze_tools` concludes tool calling is supported but cannot determine the format, `build_parser()` logs an error and returns `eps()` (graceful degradation) rather than aborting.
3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
5. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
6. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
7. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
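The tag-boundary fixing quirk above can be pictured with a plain longest-common-prefix diff whose split point is backed off so it never lands inside a `<...>` token. This is a simplified Python stand-in for `calculate_diff_split()`, which additionally handles the suffix side and `[...]` style markers:

```python
def common_prefix_len(a: str, b: str) -> int:
    n = 0
    while n < min(len(a), len(b)) and a[n] == b[n]:
        n += 1
    return n

def safe_prefix_split(a: str, b: str) -> int:
    p = common_prefix_len(a, b)
    last_open = a.rfind("<", 0, p)
    if last_open != -1:
        close = a.find(">", last_open)
        if close == -1 or close >= p:   # split point falls inside a tag
            p = last_open               # back off to the tag start
    return p

a = "<|assistant|><tool_call>...</tool_call>"
b = "<|assistant|><tool_response>...</tool_response>"
print(a[:common_prefix_len(a, b)])   # '<|assistant|><tool_'  (cuts the tag open)
print(a[:safe_prefix_split(a, b)])   # '<|assistant|>'        (clean boundary)
```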

View File

@@ -42,22 +42,12 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
### Ascend NPU
You can retrieve your Ascend device IDs using the following command:
**Verified devices**
```sh
lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
```
**Devices**
| Device Id | Product Series | Product Models | Chip Model | Verified Status |
|:---------:|----------------|----------------|:----------:|:---------------:|
| d803 | Atlas A3 Train | | 910C | |
| d803 | Atlas A3 Infer | | 910C | |
| d802 | Atlas A2 Train | | 910B | |
| d802 | Atlas A2 Infer | Atlas 300I A2 | 910B | Support |
| d801 | Atlas Train | | 910 | |
| d500 | Atlas Infer | Atlas 300I Duo | 310P | Support |
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
| Atlas 300I Duo | Support |
*Notes:*
@@ -67,9 +57,6 @@ lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
## Model Supports
<details>
<summary>Text-only</summary>
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| Llama-2 | √ | √ | √ |
@@ -131,11 +118,8 @@ lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
| Trillion-7B-preview | √ | √ | √ |
| Ling models | √ | √ | √ |
</details>
<details>
<summary>Multimodal</summary>
**Multimodal**
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
@@ -150,22 +134,15 @@ lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
| GLM-EDGE | √ | √ | √ |
| Qwen2-VL | √ | √ | √ |
</details>
## DataType Supports
| DataType | 910B | 310P |
|:----------------------:|:-------:|:-------:|
| FP16 | Support | Support |
| Q8_0 | Support | Partial |
| Q4_0 | Support | Partial |
| BF16 | Support | |
> **310P note**
> - `Q8_0`: data transform / buffer path is implemented, and `GET_ROWS` is supported, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
> - `Q4_0`: data transform / buffer path is implemented, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
| DataType | Status |
|:----------------------:|:-------:|
| FP16 | Support |
| Q8_0 | Support |
| Q4_0 | Support |
## Docker
@@ -183,20 +160,7 @@ npu-smi info
# Select the cards that you want to use, make sure these cards are not used by someone.
# Following using cards of device0.
docker run --name llamacpp \
--device /dev/davinci0 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /PATH_TO_YOUR_MODELS/:/app/models \
-it llama-cpp-cann \
-m /app/models/MODEL_PATH \
-ngl 32 \
-p "Building a website can be done in 10 simple steps:"
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
```
*Notes:*
@@ -207,57 +171,69 @@ docker run --name llamacpp \
### I. Setup Environment
1. **Configure Ascend user and group**
1. **Install Ascend Driver and firmware**
```sh
sudo groupadd HwHiAiUser
# create driver running user.
sudo groupadd -g HwHiAiUser
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
sudo usermod -aG HwHiAiUser $USER
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
```
2. **Install dependencies**
**Ubuntu/Debian:**
Once installed, run `npu-smi info` to check whether the driver was installed successfully.
```sh
sudo apt-get update
sudo apt-get install -y gcc python3 python3-pip linux-headers-$(uname -r)
+-------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
+----------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+======================+===============+====================================================+
| 2 xxx | OK | 64.4 51 15 / 15 |
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
+======================+===============+====================================================+
| 5 xxx | OK | 64.0 52 15 / 15 |
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
+======================+===============+====================================================+
| No running processes found in NPU 2 |
+======================+===============+====================================================+
| No running processes found in NPU 5 |
+======================+===============+====================================================+
```
**RHEL/CentOS:**
2. **Install Ascend Firmware**
```sh
sudo yum makecache
sudo yum install -y gcc python3 python3-pip kernel-headers-$(uname -r) kernel-devel-$(uname -r)
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
```
3. **Install CANN (driver + toolkit)**
> The `Ascend-cann` package includes both the driver and toolkit.
> `$ARCH` can be `x86_64` or `aarch64`, `$CHIP` can be `910b` or `310p`.
If the following message appears, firmware is installed successfully.
```sh
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann_8.5.0_linux-$ARCH.run
sudo bash ./Ascend-cann_8.5.0_linux-$ARCH.run --install
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run
sudo bash ./Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run --install
Firmware package installed successfully!
```
4. **Verify installation**
3. **Install CANN toolkit and kernels**
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
Please download the version that matches your system. The minimum version required is 8.0.RC2.alpha002; the install commands are shown below.
```sh
npu-smi info
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
```
If device information is displayed correctly, the driver is functioning properly.
Set Ascend Variables:
```sh
# Set environment variables (adjust path if needed)
source /usr/local/Ascend/cann/set_env.sh
python3 -c "import acl; print(acl.get_soc_name())"
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
source ~/.bashrc
```
If the command outputs the chip model, the installation was successful.
Upon a successful installation, CANN is enabled for the available ascend devices.
### II. Build llama.cpp

View File

@@ -1,9 +1,6 @@
# OpenVINO Backend for llama.cpp
> [!NOTE]
> Performance and memory optimizations, accuracy validation, broader quantization coverage, broader operator and model support are work in progress.
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. [OpenVINO backend for llama.cpp](../../src/ggml-openvino) enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
This document describes the [OpenVINO backend for llama.cpp](../../src/ggml-openvino), which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. The OpenVINO backend replaces the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to OpenVINO backend, it:
@@ -182,73 +179,31 @@ curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/L
When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.
> [!NOTE]
> Default context size is set to the model training context, which may be very large. For example, 131072 for Llama 3.2 1B, which may result in lower performance, especially on edge/laptop devices. Use `-c` to limit context size in supported llama.cpp tools for better performance. For example, `-c 512`.
```bash
# If device is unset or unavailable, defaults to CPU.
# If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
# Linux
export GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
export GGML_OPENVINO_STATEFUL_EXECUTION=1
# To run llama-simple:
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
# To run in chat mode:
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# To run llama-bench, -fa 1 is needed
GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1
# NPU: keep context small to avoid failures from very large model context windows.
export GGML_OPENVINO_DEVICE=NPU
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
# Windows Command Line
set GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
set GGML_OPENVINO_STATEFUL_EXECUTION=1
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
# To run llama-simple
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
# To run in chat mode:
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024
# To run llama-bench, -fa 1 is needed
build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
# NPU: keep context small to avoid failures from very large model context windows.
# Windows Command Line
set GGML_OPENVINO_DEVICE=NPU
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "NPU"
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512
```
> [!NOTE]
> On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.
### Known Issues and Current Workarounds
- GPU stateless execution is currently affected by a known issue.
- Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using the GPU device.
- NPU failures can happen when the context size is too large. Recent llama.cpp behavior may resolve the context size to the model's training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`.
- Workaround: explicitly set the context size, for example `-c 1024` for NPU runs (see the combined example after this list). Performance is better with a lower context size.
- Additional NPU limitations:
- Model caching is not yet supported.
- `llama-server -np > 1` (multiple parallel sequences) is not supported.
- `llama-perplexity` is only supported with `-b 512` or smaller.
- `--context-shift` with `llama-cli` is currently not supported with the OpenVINO backend across CPU, GPU, and NPU devices.
- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
- For Intel GPU and NPU detection in containers, the GPU/NPU user-space drivers and libraries must be present inside the image. These will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
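The sketch below gathers these workarounds in one place (model path and context sizes are illustrative):

```bash
# Inspect the resolved context size (look for the n_ctx value in the verbose log).
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -lv 3 -c 1024
# GPU: enable stateful execution to avoid the known stateless-execution issue.
GGML_OPENVINO_DEVICE=GPU GGML_OPENVINO_STATEFUL_EXECUTION=1 ./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# NPU: keep the context small.
GGML_OPENVINO_DEVICE=NPU ./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512
```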
> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
### Docker Build
@@ -274,42 +229,31 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run llama.cpp with OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model); this directory is mounted into the container in the examples below.
> [!NOTE]
> Intel GPU and NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
```bash
# Run Docker container
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# With Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# With Intel NPU access
docker run --rm -it -v ~/models:/models \
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--env=GGML_OPENVINO_DEVICE=NPU \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
```
Run Llama.cpp Server with OpenVINO Backend.
> [!NOTE]
> `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
Run Llama.cpp Server with OpenVINO Backend:
```bash
# Run the Server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# Or Using llama-server executable
./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# In a NEW terminal, test the server with curl
# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
export NO_PROXY=localhost,127.0.0.1
# Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server.
# Option 2: In a NEW terminal, test the server with curl
# Test health endpoint
curl -f http://localhost:8080/health
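# Illustrative completion request (standard llama-server HTTP API); adjust the prompt and n_predict as needed.
curl -X POST http://localhost:8080/completion \
  -H "Content-Type: application/json" \
  -d '{"prompt": "The story of AI is", "n_predict": 32}'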
@@ -351,7 +295,6 @@ The OpenVINO backend can be configured using the following environment variables
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DEVICE=GPU
export GGML_OPENVINO_STATEFUL_EXECUTION=1
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
@@ -359,27 +302,38 @@ export GGML_OPENVINO_STATEFUL_EXECUTION=1
set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
set GGML_OPENVINO_PROFILING=1
set GGML_OPENVINO_DEVICE=GPU
set GGML_OPENVINO_STATEFUL_EXECUTION=1
# Windows PowerShell
$env:GGML_OPENVINO_CACHE_DIR = "C:\tmp\ov_cache"
$env:GGML_OPENVINO_PROFILING = "1"
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
```
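To verify that model caching is working, you can check that compiled blobs appear in the cache directory after a run (the path below is the example value configured above):

```bash
# Cache files should appear here after the first run when GGML_OPENVINO_CACHE_DIR is set.
ls -lh /tmp/ov_cache
```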
#### llama-bench
```bash
# -fa 1 is required when running llama-bench with the OpenVINO backend.
GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
```
### NPU Notes
- Model caching is not yet supported
- Does not support llama-server -np > 1 (multiple parallel sequences)
- Only supports llama-perplexity -b 512 or smaller
## Llama.cpp Tools
The following tools work with the OpenVINO backend on CPU, GPU, and NPU:
- llama-bench
- llama-cli
- llama-completion
- llama-perplexity
- llama-server
- llama-simple
- llama-run
- llama-cli
- llama-server
- llama-bench
- llama-perplexity
## Work in Progress

View File

@@ -728,7 +728,7 @@ To read documentation for how to build on Android, [click here](./android.md)
## WebGPU [In Progress]
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
In the llama.cpp directory, build with CMake:
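A minimal configure-and-build sketch, assuming the `GGML_WEBGPU` CMake option and a Dawn installation that CMake can discover:

```bash
# Assumes Dawn was installed per the quickstart linked above.
cmake -B build -DGGML_WEBGPU=ON
cmake --build build --config Release
```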

View File

@@ -13,30 +13,21 @@ We have three Docker images available for this project:
Additionally, there are the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-cuda13`: Same as `full` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda13`: Same as `light` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda13`: Same as `server` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVINO support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVINO support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVINO support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-s390x`: Identical to `full`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
- `ghcr.io/ggml-org/llama.cpp:light-s390x`: Identical to `light`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
- `ghcr.io/ggml-org/llama.cpp:server-s390x`: Identical to `server`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
The GPU-enabled images are not currently tested by CI beyond being built. They are built without any variation from the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
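If you do need to build one of these images locally, the invocation typically looks like the sketch below (the target name and Dockerfile path follow the `.devops/` layout referenced above; adjust for the backend you want):

```bash
# Build a CUDA-enabled server image from the repository root (illustrative tag and Dockerfile name).
docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
```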
@@ -88,7 +79,7 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
The defaults are:
- `CUDA_VERSION` set to `12.8.1`
- `CUDA_VERSION` set to `12.4.0`
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
The resulting images are essentially the same as the non-CUDA images:

View File

@@ -31,13 +31,6 @@ llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.g
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
```
> [!IMPORTANT]
>
> OCR models are trained with a specific prompt and input structure; please refer to these discussions for more info:
> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
## Pre-quantized models
These are ready-to-use models; most of them come with `Q4_K_M` quantization by default. They can be found on the ggml-org Hugging Face page: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc

View File

@@ -12,9 +12,9 @@ Legend:
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CANN | CPU | CUDA | MTL | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -23,63 +23,63 @@ Legend:
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| DIAG | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | | ❌ | ❌ |
| DIAG | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | | ❌ | ✅ | ❌ | | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
| PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_1D | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| POOL_1D | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -91,31 +91,31 @@ Legend:
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | | ❌ | 🟡 | ✅ | | ❌ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | | ❌ | 🟡 | ✅ | | ❌ | ❌ |
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | | ❌ | ❌ | ✅ | | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | | ❌ | ❌ | ✅ | | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -24,12 +24,12 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
return 1;
}
common_init();
// number of parallel batches
int n_parallel = params.n_parallel;

View File

@@ -213,12 +213,12 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
return 1;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);

View File

@@ -545,12 +545,11 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
return 1;
}
common_init();
llama_backend_init();
llama_model_params model_params = llama_model_default_params();

View File

@@ -99,12 +99,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1;
}
common_init();
params.embedding = true;
// get max number of sequences per batch

View File

@@ -15,18 +15,13 @@ static bool run(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_vocab_get_add_bos(vocab);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos, true);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (tokens.empty()) {
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
return false;
}
LOG_INF("number of input tokens = %zu\n", tokens.size());
for (size_t i = 0; i < tokens.size(); ++i) {
LOG_INF(" %d\n", tokens[i]);
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
@@ -42,12 +37,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);

View File

@@ -19,12 +19,12 @@ static void print_usage(int /*argc*/, char ** argv) {
int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}
common_init();
// init LLM
llama_backend_init();

View File

@@ -28,6 +28,9 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
return f'({result})?' if min_items == 0 else result
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
has_min = min_value != None
has_max = max_value != None
def digit_range(from_char: str, to_char: str):
out.append("[")
if from_char == to_char:
@@ -103,7 +106,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
out.append(to_str[i])
out.append("]")
if min_value is not None and max_value is not None:
if has_min and has_max:
if min_value < 0 and max_value < 0:
out.append("\"-\" (")
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
@@ -130,7 +133,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
less_decimals = max(decimals_left - 1, 1)
if min_value is not None:
if has_min:
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
@@ -174,7 +177,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
more_digits(length - 1, less_decimals)
return
if max_value is not None:
if has_max:
if max_value >= 0:
if top_level:
out.append("\"-\" [1-9] ")

View File

@@ -365,13 +365,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processSystemPrompt(
const auto *system_prompt = env->GetStringUTFChars(jsystem_prompt, nullptr);
LOGd("%s: System prompt received: \n%s", __func__, system_prompt);
std::string formatted_system_prompt(system_prompt);
env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);
// Format system prompt if applicable
const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
if (has_chat_template) {
formatted_system_prompt = chat_add_and_format(ROLE_SYSTEM, system_prompt);
}
env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);
// Tokenize system prompt
const auto system_tokens = common_tokenize(g_context, formatted_system_prompt,
@@ -414,13 +414,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processUserPrompt(
const auto *const user_prompt = env->GetStringUTFChars(juser_prompt, nullptr);
LOGd("%s: User prompt received: \n%s", __func__, user_prompt);
std::string formatted_user_prompt(user_prompt);
env->ReleaseStringUTFChars(juser_prompt, user_prompt);
// Format user prompt if applicable
const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
if (has_chat_template) {
formatted_user_prompt = chat_add_and_format(ROLE_USER, user_prompt);
}
env->ReleaseStringUTFChars(juser_prompt, user_prompt);
// Decode formatted user prompts
auto user_tokens = common_tokenize(g_context, formatted_user_prompt, has_chat_template, has_chat_template);

View File

@@ -43,12 +43,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams

View File

@@ -12,8 +12,6 @@ int main(int argc, char ** argv){
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}

View File

@@ -18,12 +18,12 @@ int main(int argc, char ** argv){
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
const int n_draft = params.speculative.n_max;
// init llama.cpp

View File

@@ -18,12 +18,12 @@ int main(int argc, char ** argv){
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
// max. number of additional tokens to draft if match is found
const int n_draft = params.speculative.n_max;

View File

@@ -7,7 +7,7 @@ import os
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""

View File

@@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print("Using SentenceTransformer to apply all numbered layers")
model = SentenceTransformer(model_path)
tokenizer = model.tokenizer
config = model[0].auto_model.config
config = model[0].auto_model.config # type: ignore
else:
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
@@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print(f"Model file: {type(model).__module__}")
# Verify the model is using the correct sliding window
if hasattr(model.config, 'sliding_window'):
print(f"Model's sliding_window: {model.config.sliding_window}")
if hasattr(model.config, 'sliding_window'): # type: ignore
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
else:
print("Model config does not have sliding_window attribute")
@@ -152,7 +152,7 @@ def main():
device = next(model.parameters()).device
else:
# For SentenceTransformer, get device from the underlying model
device = next(model[0].auto_model.parameters()).device
device = next(model[0].auto_model.parameters()).device # type: ignore
model_name = os.path.basename(model_path)
@@ -177,7 +177,7 @@ def main():
print(f"{token_id:6d} -> '{token_str}'")
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore
else:
# Standard approach: use base model output only
encoded = tokenizer(
@@ -205,12 +205,12 @@ def main():
print(f"Embedding dimension: {all_embeddings.shape[1]}")
if len(all_embeddings.shape) == 1:
n_embd = all_embeddings.shape[0]
n_embd = all_embeddings.shape[0] # type: ignore
n_embd_count = 1
all_embeddings = all_embeddings.reshape(1, -1)
else:
n_embd = all_embeddings.shape[1]
n_embd_count = all_embeddings.shape[0]
n_embd = all_embeddings.shape[1] # type: ignore
n_embd_count = all_embeddings.shape[0] # type: ignore
print()

View File

@@ -5,7 +5,7 @@ import sys
import os
import argparse
from pathlib import Path
from common import get_model_name_from_env_path # type: ignore[import-not-found, ty:unresolved-import]
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2)

View File

@@ -2,7 +2,7 @@
import argparse
import sys
from common import compare_tokens # type: ignore[import-not-found, ty:unresolved-import]
from common import compare_tokens # type: ignore
def parse_arguments():

View File

@@ -7,7 +7,7 @@ import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

View File

@@ -163,12 +163,12 @@ int main(int argc, char ** argv) {
params.n_predict = 128;
params.n_junk = 1;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
}
common_init();
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;

View File

@@ -25,12 +25,12 @@ int main(int argc, char ** argv) {
params.n_keep = 32;
params.i_pos = -1;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
return 1;
}
common_init();
int n_junk = params.n_junk;
int n_keep = params.n_keep;
int n_grp = params.grp_attn_n;

View File

@@ -6,7 +6,7 @@ import re
from copy import copy
from enum import Enum
from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse
from pydantic import BaseModel, create_model
@@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a type annotation
if param.annotation == inspect.Parameter.empty:
raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a type annotation""")
raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation")
# Find the parameter's description in the docstring
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
@@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a description
if not param_doc or not param_doc.description:
raise ValueError(
f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""")
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
# Add parameter details to the schema
param_docs.append((param.name, param_doc))
@@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model
dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields)
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description
@@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}:
array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (list[array_type], ...) # ty: ignore[invalid-type-form]
fields[field_name] = (List[array_type], ...)
else:
fields[field_name] = (list, ...)
elif field_type == "object":

Some files were not shown because too many files have changed in this diff