Compare commits


22 Commits

Author SHA1 Message Date
Georgi Gerganov
e7fbfc9b80 ci : tmp fixes 2026-02-11 15:48:40 +02:00
Andreas Kieslinger
d46bd7ef2d Apply suggestion from @ggerganov (src->buffer to buf_src) v2
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-02-11 13:54:57 +02:00
Andreas Kieslinger
070933684f Apply suggestion from @ggerganov (src->buffer to buf_src)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-02-11 13:54:25 +02:00
aendk
1528c841dc Simplifies synchronizations to adhere to saaasg pattern. 2026-02-11 13:54:25 +02:00
aendk
ff28ae93a2 Corrects initialization of ggml_backend_sync_mode in
ggml_backend_sched_split initialization
2026-02-11 13:54:25 +02:00
aendk
01d89f9b96 Reintroduces stricter check for CPU->CUDA backend async copy via
GGML_DEVICE_TYPE_CPU.
2026-02-11 13:54:25 +02:00
aendk
e74b070e30 Makes opt-in to relax use of explicit syncs more general. Backends like
vulkan which require a synchronization between HtoD copies and graph
execution could also adopt this change now.
2026-02-11 13:54:25 +02:00
aendk
b7376c3ed7 Minor cleanup 2026-02-11 13:54:24 +02:00
aendk
d776354dc9 Relax requirement of checks in async CUDA copies from backend and buffer type to just buffer type, to avoid linking issues 2026-02-11 13:54:23 +02:00
aendk
79a77277ad Reworked backend detection in ggml-backend.cpp to avoid linking
conflicts
2026-02-11 13:53:43 +02:00
aendk
44e481bb34 Adds macro guards to allow compilation in non-CUDA builds 2026-02-11 13:53:43 +02:00
aendk
91c6026b5c Exchanges synchronous copy with async copy function. 2026-02-11 13:53:43 +02:00
aendk
2ad0d391e1 Adds function to relax sync requirements between input copies on
supported backends (CUDA for now)
2026-02-11 13:53:43 +02:00
aendk
dd9f1faf42 Adds CPU-to-CUDA copy capability to
ggml_backend_cuda_cpy_tensor_async()
2026-02-11 13:53:41 +02:00
Georgi Gerganov
a554bdd70f metal : fix event synchronization in cpy_tensor_async (#19402) 2026-02-11 13:52:47 +02:00
Johannes Gäßler
02ee504f90 fix output pattern 2026-02-09 23:04:31 +01:00
Johannes Gäßler
94c66557b8 re-use buffers + ggml contexts 2026-02-09 00:37:11 +01:00
Johannes Gäßler
6ab02d0908 unconditional peer access 2026-02-08 12:20:28 +01:00
Johannes Gäßler
2b7055381c add support for 4/8 GPUs 2026-02-07 23:08:10 +01:00
Johannes Gäßler
b7fd10664e partial Vulkan fix 2026-02-07 15:31:57 +01:00
Johannes Gäßler
a630b27da7 support for GPT-OSS, Qwen 3 MoE 2026-02-06 22:13:53 +01:00
Johannes Gäßler
39b96f8fe1 ggml: backend-agnostic tensor parallelism 2026-02-05 21:49:34 +01:00
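The last commit in this range is the foundation for the rest: backend-agnostic tensor parallelism, with the follow-up commits adding CPU-to-CUDA async input copies and relaxing the scheduler's explicit synchronizations around them. As a conceptual illustration only (a minimal standalone sketch, not llama.cpp's implementation; `matvec_rows` and the loop over "devices" are invented for this example), a row-wise tensor split gives each GPU a disjoint block of weight rows, so each computes a disjoint slice of the output:

```cpp
#include <cstdio>
#include <vector>

// Sketch of row-split tensor parallelism: each "device" owns a contiguous
// block of weight rows and produces the matching output slice. Real backends
// would run these blocks concurrently on separate GPUs.
static void matvec_rows(const float * w, const float * x,
                        float * y, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        float acc = 0.0f;
        for (int c = 0; c < cols; ++c) {
            acc += w[r * cols + c] * x[c];
        }
        y[r] = acc;
    }
}

int main() {
    const int rows = 4, cols = 3, n_dev = 2;
    std::vector<float> w = { 1, 2, 3,
                             4, 5, 6,
                             7, 8, 9,
                            10,11,12 };
    std::vector<float> x = { 1, 1, 1 };
    std::vector<float> y(rows);

    const int rows_per_dev = rows / n_dev;
    for (int d = 0; d < n_dev; ++d) {  // one iteration per "GPU"
        matvec_rows(w.data() + d * rows_per_dev * cols, x.data(),
                    y.data() + d * rows_per_dev, rows_per_dev, cols);
    }
    for (float v : y) printf("%g\n", v);  // prints 6 15 24 33
    return 0;
}
```

With a row split the output slices are disjoint, so the devices only need a gather at the end; a column (input-dimension) split would instead produce partial sums requiring a reduction, which is why the relaxed-but-correct synchronization between input copies and graph execution in the commits above matters.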
532 changed files with 21699 additions and 46015 deletions

View File

@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=7.2
ARG AMDGPU_VERSION=7.2
ARG ROCM_VERSION=7.0
ARG AMDGPU_VERSION=7.0
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@@ -11,12 +11,13 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102, not officially supported
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
#ARG ROCM_DOCKER_ARCH='gfx1151'
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}

View File

@@ -41,7 +41,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
multiple: true
validations:
required: true

View File

@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
multiple: true
validations:
required: true

View File

@@ -11,5 +11,5 @@ runs:
- name: Setup ROCm
uses: ./.github/actions/install-exe
with:
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-Win11-For-HIP.exe
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-WinSvr2022-For-HIP.exe
args: -install

View File

@@ -68,7 +68,7 @@ jobs:
env:
# Make sure this is in sync with build.yml
HIPSDK_INSTALLER_VERSION: "26.Q1"
HIPSDK_INSTALLER_VERSION: "25.Q3"
steps:
- name: Clone

View File

@@ -295,7 +295,6 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
@@ -308,7 +307,6 @@ jobs:
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
@@ -470,7 +468,7 @@ jobs:
export GGML_VK_VISIBLE_DEVICES=0
export GGML_VK_DISABLE_F16=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800
ctest -L main --verbose --timeout 4200
ubuntu-24-cmake-webgpu:
runs-on: ubuntu-24.04
@@ -1175,8 +1173,10 @@ jobs:
runs-on: windows-2022
env:
# The ROCm version must correspond to the version used in the HIP SDK.
ROCM_VERSION: "6.4.2"
# Make sure this is in sync with build-cache.yml
HIPSDK_INSTALLER_VERSION: "26.Q1"
HIPSDK_INSTALLER_VERSION: "25.Q3"
steps:
- name: Clone
@@ -1186,7 +1186,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/${{ env.ROCM_VERSION }}/pool/main/r/rocwmma-dev/rocwmma-dev_1.7.0.60402-120~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@@ -1229,7 +1229,7 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `

View File

@@ -21,7 +21,7 @@ on:
jobs:
deploy:
runs-on: ubuntu-latest
runs-on: ubuntu-slim
steps:
- uses: actions/checkout@v6

View File

@@ -516,113 +516,17 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-22-rocm:
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- ROCM_VERSION: "7.2"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt install -y build-essential git cmake wget
- name: Setup Legacy ROCm
if: matrix.ROCM_VERSION == '7.2'
id: legacy_env
run: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
sudo tee /etc/apt/sources.list.d/rocm.list << EOF
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} jammy main
EOF
sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF
sudo apt update
sudo apt-get install -y libssl-dev rocm-hip-sdk
- name: Setup TheRock
if: matrix.ROCM_VERSION != '7.2'
id: therock_env
run: |
wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
mkdir install
tar -xf *.tar.gz -C install
export ROCM_PATH=$(pwd)/install
echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGPU_TARGETS="${{ matrix.gpu_targets }}" \
-DGGML_HIP=ON \
-DHIP_PLATFORM=amd \
-DGGML_HIP_ROCWMMA_FATTN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
windows-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "26.Q1"
HIPSDK_INSTALLER_VERSION: "25.Q3"
strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- name: Clone
@@ -632,7 +536,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@@ -655,7 +559,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
@@ -689,20 +593,20 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `
-DGGML_CPU=OFF `
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
@@ -880,7 +784,6 @@ jobs:
- windows-cuda
- windows-sycl
- windows-hip
- ubuntu-22-rocm
- ubuntu-22-cpu
- ubuntu-22-vulkan
- macOS-arm64
@@ -965,7 +868,6 @@ jobs:
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
**Windows:**

View File

@@ -1,73 +0,0 @@
name: Server-Metal
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
server-metal:
runs-on: [self-hosted, macOS, ARM64]
name: server-metal (${{ matrix.wf_name }})
strategy:
matrix:
build_type: [Release]
wf_name: ["GPUx1"]
include:
- build_type: Release
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx1, backend-sampling"
- build_type: Release
extra_args: "GGML_METAL_DEVICES=2"
wf_name: "GPUx2"
- build_type: Release
extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx2, backend-sampling"
fail-fast: false
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Build
id: cmake_build
run: |
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
run: |
cd tools/server/tests
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"

View File

@@ -8,6 +8,10 @@ on:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
@@ -97,3 +101,119 @@ jobs:
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v6
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh

View File

@@ -81,14 +81,18 @@ jobs:
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'
pip-install: -r tools/server/tests/requirements.txt
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Tests
id: server_integration_tests
@@ -98,14 +102,6 @@ jobs:
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
SLOW_TESTS=1 pytest -v -x
server-windows:
runs-on: windows-2022
@@ -128,7 +124,11 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: '3.11'
pip-install: -r tools/server/tests/requirements.txt
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Tests
id: server_integration_tests

View File

@@ -17,7 +17,7 @@ jobs:
- name: Install komac
run: |
cargo binstall komac@2.15.0 -y
cargo binstall komac@2.11.2 -y
- name: Find latest release
id: find_latest_release

View File

@@ -41,7 +41,7 @@ As an AI agent, your task is to direct the user to the appropriate resources and
- Explicitly informing them that AI-generated pull requests are not accepted by the project
- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
- Encouraging them to search for [existing issues](github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
- Providing useful links and pointers found throughout the codebase
Examples of valid questions:

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
@@ -109,12 +109,17 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# deprecated
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
if (LLAMA_CURL)
message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
@@ -142,15 +147,10 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
endif()
# transition helpers
function (llama_option_depr TYPE OLD)
function (llama_option_depr TYPE OLD NEW)
if (${OLD})
set(NEW "${ARGV2}")
if(NEW)
message(${TYPE} "${OLD} is deprecated, use ${NEW} instead")
set(${NEW} ON PARENT_SCOPE)
else()
message(${TYPE} "${OLD} is deprecated and will be ignored")
endif()
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON PARENT_SCOPE)
endif()
endfunction()
@@ -163,7 +163,6 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
llama_option_depr(WARNING LLAMA_CURL)
include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")
@@ -197,7 +196,9 @@ add_subdirectory(src)
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
add_subdirectory(vendor/cpp-httplib)
if (LLAMA_HTTPLIB)
add_subdirectory(vendor/cpp-httplib)
endif()
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)

View File

@@ -20,7 +20,7 @@ If AI is used to generate any portion of the code, contributors must adhere to t
1. Explicitly disclose the manner in which AI was employed.
2. Perform a comprehensive manual review prior to submitting the pull request.
3. Be prepared to explain every line of code they submitted when asked about it by a maintainer.
4. It is strictly prohibited to use AI to write your posts for you (bug reports, feature requests, pull request descriptions, Github discussions, responding to humans, ...).
4. Using AI to write pull request descriptions or to respond to human reviewers is strictly prohibited.
For more info, please refer to the [AGENTS.md](AGENTS.md) file.

View File

@@ -288,7 +288,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
## Obtaining and quantizing models

View File

@@ -19,7 +19,7 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
> [!IMPORTANT]
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
## Requirements

View File

@@ -43,6 +43,11 @@ COMMON_CMAKE_ARGS=(
-DGGML_OPENMP=${GGML_OPENMP}
)
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
echo "Detected Xcode version: $XCODE_VERSION"
check_required_tool() {
local tool=$1
local install_message=$2
@@ -55,12 +60,9 @@ check_required_tool() {
}
echo "Checking for required tools..."
check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
check_required_tool "xcrun" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
XCODE_VERSION=$(xcrun xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
echo "Detected Xcode version: $XCODE_VERSION"
check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
set -e
@@ -258,7 +260,7 @@ combine_static_libraries() {
# Since we have multiple architectures libtool will find object files that do not
# match the target architecture. We suppress these warnings.
xcrun libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
# Determine SDK, architectures, and install_name based on platform and simulator flag.
local sdk=""
@@ -331,7 +333,7 @@ combine_static_libraries() {
# Platform-specific post-processing for device builds
if [[ "$is_simulator" == "false" ]]; then
if xcrun -f vtool &>/dev/null; then
if command -v xcrun vtool &>/dev/null; then
case "$platform" in
"ios")
echo "Marking binary as a framework binary for iOS..."
@@ -449,9 +451,10 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xros \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos --config Release -- -quiet
@@ -464,9 +467,10 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xrsimulator \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos-sim --config Release -- -quiet
@@ -524,13 +528,13 @@ combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
# Create XCFramework with correct debug symbols paths
echo "Creating XCFramework..."
xcrun xcodebuild -create-xcframework \
xcodebuild -create-xcframework \
-framework $(pwd)/build-ios-sim/framework/llama.framework \
-debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
-framework $(pwd)/build-ios-device/framework/llama.framework \
-debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
-framework $(pwd)/build-macos/framework/llama.framework \
-debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
-debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \
-framework $(pwd)/build-visionos/framework/llama.framework \
-debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
-framework $(pwd)/build-visionos-sim/framework/llama.framework \

View File

@@ -413,8 +413,6 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -540,8 +538,6 @@ function gg_run_embd_bge_small {
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -582,8 +578,6 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
@@ -683,8 +677,8 @@ fi
ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
#test $ret -eq 0 && gg_run ctest_debug
#test $ret -eq 0 && gg_run ctest_release
if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
test $ret -eq 0 && gg_run test_backend_ops_cpu

View File

@@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
llama_add_compile_flags()
# Build info header
#
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
@@ -109,16 +110,33 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(${TARGET} PRIVATE
build_info
cpp-httplib
)
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
if (LLAMA_HTTPLIB)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()
if (LLAMA_LLGUIDANCE)
include(ExternalProject)
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
set(LLGUIDANCE_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}llguidance${CMAKE_STATIC_LIBRARY_SUFFIX}")
# Set the correct library file extension based on platform
if (WIN32)
set(LLGUIDANCE_LIB_NAME "llguidance.lib")
# Add Windows-specific libraries
set(LLGUIDANCE_PLATFORM_LIBS
ws2_32 # Windows Sockets API
userenv # For GetUserProfileDirectoryW
ntdll # For NT functions
bcrypt # For BCryptGenRandom
)
else()
set(LLGUIDANCE_LIB_NAME "libllguidance.a")
set(LLGUIDANCE_PLATFORM_LIBS "")
endif()
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
@@ -140,10 +158,8 @@ if (LLAMA_LLGUIDANCE)
add_dependencies(llguidance llguidance_ext)
target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
target_link_libraries(${TARGET} PRIVATE llguidance)
if (WIN32)
target_link_libraries(${TARGET} PRIVATE ws2_32 userenv ntdll bcrypt)
endif()
endif()
# Add platform libraries to the main target
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
target_link_libraries(${TARGET} PUBLIC llama Threads::Threads)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

View File

@@ -1301,7 +1301,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, bool value) {
params.kv_unified = value;
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
@@ -1578,7 +1578,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--temp", "--temperature"}, "N",
{"--temp"}, "N",
string_format("temperature (default: %.2f)", (double)params.sampling.temp),
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
@@ -1611,7 +1611,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--top-nsigma", "--top-n-sigma"}, "N",
{"--top-nsigma"}, "N",
string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
[](common_params & params, const std::string & value) {
params.sampling.top_n_sigma = std::stof(value);
@@ -1634,7 +1634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--typical", "--typical-p"}, "N",
{"--typical"}, "N",
string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
[](common_params & params, const std::string & value) {
params.sampling.typ_p = std::stof(value);
@@ -2331,19 +2331,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(common_arg(
{"-sm", "--split-mode"}, "{none,layer,row}",
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
"how to split the model across multiple GPUs, one of:\n"
"- none: use one GPU only\n"
"- layer (default): split layers and KV across GPUs\n"
"- row: split rows across GPUs",
"- layer (default): split layers and KV across GPUs (pipelined)\n"
"- row: split weight across GPUs by rows (parallelized)\n"
"- tensor: split weights and KV across GPUs (parallelized)",
[](common_params & params, const std::string & value) {
std::string arg_next = value;
if (arg_next == "none") {
if (value == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (arg_next == "layer") {
} else if (value == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (arg_next == "row") {
} else if (value == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
} else if (value == "tensor") {
params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
throw std::invalid_argument("invalid value");
}
@@ -2520,28 +2522,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"-a", "--alias"}, "STRING",
"set model name aliases, comma-separated (to be used by API)",
"set alias for model name (to be used by REST API)",
[](common_params & params, const std::string & value) {
for (auto & alias : string_split<std::string>(value, ',')) {
alias = string_strip(alias);
if (!alias.empty()) {
params.model_alias.insert(alias);
}
}
params.model_alias = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
add_opt(common_arg(
{"--tags"}, "STRING",
"set model tags, comma-separated (informational, not used for routing)",
[](common_params & params, const std::string & value) {
for (auto & tag : string_split<std::string>(value, ',')) {
tag = string_strip(tag);
if (!tag.empty()) {
params.model_tags.insert(tag);
}
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -3454,6 +3439,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.ngram_size_m = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-check-rate"}, "N",
string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate),
[](common_params & params, int value) {
if (value < 1) {
throw std::invalid_argument("ngram check rate must be at least 1");
}
params.speculative.ngram_check_rate = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-min-hits"}, "N",
string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),

View File

@@ -803,7 +803,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
}
// remove potential partial suffix
if (builder.pos() == builder.input().size() && builder.is_partial()) {
if (builder.pos() == builder.input().size()) {
if (unclosed_reasoning_content.empty()) {
rstrip(content);
trim_potential_partial_word(content);

View File

@@ -893,6 +893,23 @@ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
}
static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
static const xml_tool_call_format form = ([]() {
xml_tool_call_format form {};
form.scope_start = "<tool_call>";
form.tool_start = "<function=";
form.tool_sep = ">";
form.key_start = "<parameter=";
form.key_val_sep = ">";
form.val_end = "</parameter>";
form.tool_end = "</function>";
form.scope_end = "</tool_call>";
form.trim_raw_argval = true;
return form;
})();
builder.consume_reasoning_with_xml_tool_calls(form);
}
static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
static const xml_tool_call_format form = ([]() {
xml_tool_call_format form {};
@@ -1573,6 +1590,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_KIMI_K2:
common_chat_parse_kimi_k2(builder);
break;
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML:
common_chat_parse_qwen3_coder_xml(builder);
break;
case COMMON_CHAT_FORMAT_APRIEL_1_5:
common_chat_parse_apriel_1_5(builder);
break;
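Pieced together from the form fields above (scope, function, and parameter delimiters; whitespace as emitted by the matching grammar later in this diff, with `trim_raw_argval` stripping it on the parse side), the tool-call text this parser consumes looks like the following; the function and parameter names are illustrative:

```
<tool_call>
<function=get_weather>
<parameter=city>
Paris
</parameter>
</function>
</tool_call>
```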

View File

@@ -65,25 +65,14 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
} else if (!content_parts.empty()) {
if (concat_typed_text) {
std::string text;
bool last_was_media_marker = false;
// join parts with newline, do not add newline before or after media markers
for (const auto & part : content_parts) {
bool add_new_line = true;
if (part.type == "text") {
add_new_line = !last_was_media_marker && !text.empty();
last_was_media_marker = false;
} else if (part.type == "media_marker") {
add_new_line = false;
last_was_media_marker = true;
} else {
if (part.type != "text") {
LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
continue;
}
if (add_new_line) {
if (!text.empty()) {
text += '\n';
}
text += part.text;
}
jmsg["content"] = text;
@@ -330,7 +319,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
throw std::invalid_argument("Missing content part type: " + part.dump());
}
const auto & type = part.at("type");
if (type != "text" && type != "media_marker") {
if (type != "text") {
throw std::invalid_argument("Unsupported content part type: " + type.dump());
}
common_chat_msg_content_part msg_part;
@@ -391,46 +380,15 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
return msgs;
}
static json render_message_to_json(const std::vector<common_chat_msg> & msgs, const jinja::caps & c) {
if (!c.supports_string_content && !c.supports_typed_content) {
LOG_WRN("%s: Neither string content nor typed content is supported by the template. This is unexpected and may lead to issues.\n", __func__);
}
bool only_string_accepted = c.supports_string_content && !c.supports_typed_content;
bool only_typed_accepted = !c.supports_string_content && c.supports_typed_content;
json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
json messages = json::array();
for (const auto & msg : msgs) {
if (only_string_accepted) {
json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ true);
messages.push_back(jmsg);
} else if (only_typed_accepted) {
json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false);
if (jmsg.at("content").is_string()) {
jmsg["content"] = json::array({
json{
{"type", "text"},
{"text", jmsg.at("content").get<std::string>()},
}
});
}
messages.push_back(jmsg);
} else {
json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false);
messages.push_back(jmsg);
}
json jmsg = msg.to_json_oaicompat(concat_typed_text);
messages.push_back(jmsg);
}
return messages;
}
// DEPRECATED: only used in tests
json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
jinja::caps c;
c.supports_string_content = true;
c.supports_typed_content = !concat_typed_text;
return render_message_to_json(msgs, c);
}
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
std::vector<common_chat_tool> result;
@@ -736,6 +694,7 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_MINIMAX_M2: return "MiniMax-M2";
case COMMON_CHAT_FORMAT_GLM_4_5: return "GLM 4.5";
case COMMON_CHAT_FORMAT_KIMI_K2: return "Kimi K2";
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
@@ -1521,17 +1480,14 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
return data;
}
static common_chat_params common_chat_params_init_qwen3_coder(const common_chat_template & tmpl, const struct templates_params & inputs) {
static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
// Nemotron Nano 3 and Step-3.5-Flash use the Qwen3 Coder tool calling with thinking
bool supports_reasoning = (tmpl.source().find("<think>") != std::string::npos);
// Handle thinking tags appropriately based on inputs.enable_thinking
if (supports_reasoning && string_ends_with(data.prompt, "<think>\n")) {
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
@@ -1540,21 +1496,19 @@ static common_chat_params common_chat_params_init_qwen3_coder(const common_chat_
}
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
};
if (supports_reasoning) {
data.preserved_tokens.insert(data.preserved_tokens.end(), {"<think>", "</think>"});
}
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
auto reasoning = p.eps();
if (supports_reasoning && inputs.enable_thinking && extract_reasoning) {
if (inputs.enable_thinking && extract_reasoning) {
auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
if (data.thinking_forced_open) {
reasoning = reasoning_content;
@@ -1892,6 +1846,38 @@ static common_chat_params common_chat_params_init_minimax_m2(const common_chat_t
return data;
}
static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.prompt = apply(tmpl, params);
data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
data.preserved_tokens = {
"<tool_call>",
"</tool_call>",
"<function=",
"</function>",
"<parameter=",
"</parameter>",
};
// build grammar for tool call
static const xml_tool_call_format form {
/* form.scope_start = */ "<tool_call>\n",
/* form.tool_start = */ "<function=",
/* form.tool_sep = */ ">\n",
/* form.key_start = */ "<parameter=",
/* form.key_val_sep = */ ">\n",
/* form.val_end = */ "\n</parameter>\n",
/* form.tool_end = */ "</function>\n",
/* form.scope_end = */ "</tool_call>",
};
build_grammar_xml_tool_call(data, params.tools, form);
return data;
}
static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -2015,7 +2001,6 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
if (has_reasoning_content && has_tool_calls) {
auto adjusted_message = msg;
adjusted_message["thinking"] = msg.at("reasoning_content");
adjusted_message.erase("content");
adjusted_messages.push_back(adjusted_message);
} else {
adjusted_messages.push_back(msg);
@@ -3035,7 +3020,7 @@ static common_chat_params common_chat_templates_apply_jinja(
: *tmpls->template_default;
const auto & src = tmpl.source();
const auto & caps = tmpl.original_caps();
params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
params.add_generation_prompt = inputs.add_generation_prompt;
params.tool_choice = inputs.tool_choice;
params.reasoning_format = inputs.reasoning_format;
@@ -3113,13 +3098,19 @@ static common_chat_params common_chat_templates_apply_jinja(
}
// Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
// Detect via XML markers: <tool_call>, <function=...>, and <parameter=...> blocks.
// Also matches Step-3.5-Flash and Nemotron 3 Nano which use the same output format.
// Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
// Require presence of <tool_call>, <function=...>, and <parameter=...> blocks.
if (src.find("<tool_call>") != std::string::npos &&
src.find("<function>") != std::string::npos &&
src.find("<function=") != std::string::npos &&
src.find("<parameters>") != std::string::npos &&
src.find("<parameter=") != std::string::npos) {
workaround::func_args_not_string(params.messages);
return common_chat_params_init_qwen3_coder(tmpl, params);
// Nemotron 3 Nano 30B A3B
if (src.find("<think>") != std::string::npos) {
return common_chat_params_init_nemotron_v3(tmpl, params);
}
return common_chat_params_init_qwen3_coder_xml(tmpl, params);
}
// Xiaomi MiMo format detection (must come before Hermes 2 Pro)
@@ -3285,7 +3276,7 @@ static common_chat_params common_chat_templates_apply_legacy(
for (const auto & msg : inputs.messages) {
auto content = msg.content;
for (const auto & part : msg.content_parts) {
if (part.type != "text" && part.type != "media_marker") {
if (part.type != "text") {
LOG_WRN("Ignoring non-text content part: %s\n", part.type.c_str());
continue;
}

View File

@@ -128,6 +128,7 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_GLM_4_5,
COMMON_CHAT_FORMAT_MINIMAX_M2,
COMMON_CHAT_FORMAT_KIMI_K2,
COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
COMMON_CHAT_FORMAT_SOLAR_OPEN,
@@ -239,8 +240,6 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
// Parses a JSON array of messages in OpenAI's chat completion API format.
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
// DEPRECATED: only used in tests
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);

View File

@@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "ggml.h"
#include "gguf.h"
@@ -5,12 +9,12 @@
#include "log.h"
#include "llama.h"
#include "sampling.h"
#include "unicode.h"
#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <codecvt>
#include <chrono>
#include <cstdarg>
#include <cstring>
@@ -452,6 +456,34 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
bool has_suffix = string_ends_with(str, suffix);
if (has_suffix) {
str = str.substr(0, str.size() - suffix.size());
}
return has_suffix;
}
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
if (!str.empty() && !stop.empty()) {
const char text_last_char = str.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
if (stop[char_index] == text_last_char) {
const auto current_partial = stop.substr(0, char_index + 1);
if (string_ends_with(str, current_partial)) {
return str.size() - char_index - 1;
}
}
}
}
return std::string::npos;
}
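// usage sketch (illustrative, not part of the diff): with str = "Hello <|e"
// and stop = "<|end|>", the partial suffix "<|e" matches, so the function
// returns 6, the index where it begins; a streaming caller can hold back the
// text from that index until the stop sequence completes or diverges.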
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$&");
@@ -674,28 +706,45 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
return false;
}
size_t offset = 0;
while (offset < filename.size()) {
utf8_parse_result result = parse_utf8_codepoint(filename, offset);
if (result.status != utf8_parse_result::SUCCESS) {
return false;
}
uint32_t c = result.codepoint;
if ((result.bytes_consumed == 2 && c < 0x80) ||
(result.bytes_consumed == 3 && c < 0x800) ||
(result.bytes_consumed == 4 && c < 0x10000)) {
return false;
}
std::u32string filename_utf32;
try {
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
#if defined(__clang__)
# pragma clang diagnostic pop
#elif defined(__GNUC__)
# pragma GCC diagnostic pop
#endif
filename_utf32 = converter.from_bytes(filename);
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
// or invalid encodings were encountered. Reject such attempts
std::string filename_reencoded = converter.to_bytes(filename_utf32);
if (filename_reencoded != filename) {
return false;
}
} catch (const std::exception &) {
return false;
}
// Check for forbidden codepoints:
// - Control characters
// - Unicode equivalents of illegal characters
// - UTF-16 surrogate pairs
// - UTF-8 replacement character
// - Byte order mark (BOM)
// - Illegal characters: / \ : * ? " < > |
for (char32_t c : filename_utf32) {
if (c <= 0x1F // Control characters (C0)
|| c == 0x7F // Control characters (DEL)
|| (c >= 0x80 && c <= 0x9F) // Control characters (C1)
@@ -703,7 +752,6 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|| c == 0x2215 // Division Slash (forward slash equivalent)
|| c == 0x2216 // Set Minus (backslash equivalent)
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|| c > 0x10FFFF // Max Unicode limit
|| c == 0xFFFD // Replacement Character (UTF-8)
|| c == 0xFEFF // Byte Order Mark (BOM)
|| c == ':' || c == '*' // Illegal characters
@@ -714,7 +762,6 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
// Subdirectories not allowed, reject path separators
return false;
}
offset += result.bytes_consumed;
}
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
@@ -851,8 +898,7 @@ std::string fs_get_cache_directory() {
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || \
defined(__OpenBSD__) || defined(__NetBSD__)
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else if (std::getenv("HOME")) {
@@ -1196,7 +1242,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
return res;
}
int err = llama_set_adapter_cvec(
int err = llama_apply_adapter_cvec(
lctx,
cvec.data.data(),
cvec.data.size(),
@@ -1298,15 +1344,12 @@ std::string get_model_endpoint() {
}
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
std::vector<llama_adapter_lora *> loras;
std::vector<float> scales;
for (auto & la: lora) {
loras.push_back(la.ptr);
scales.push_back(la.scale);
llama_clear_adapter_lora(ctx);
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_set_adapter_lora(ctx, la.ptr, la.scale);
}
}
llama_set_adapters_lora(ctx, loras.data(), loras.size(), scales.data());
}
struct llama_model_params common_model_params_to_llama(common_params & params) {
@@ -1426,6 +1469,66 @@ void common_batch_add(
batch.n_tokens++;
}
//
// Token utils
//
size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
return i;
}
size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
// check for empty sequences
if (a.empty() || b.empty()) {
return 0;
}
// get the lengths of the input sequences
size_t a_len = a.size();
size_t b_len = b.size();
// initialize the maximum length of the longest common subsequence (LCS)
size_t max_length = 0;
// use two rows instead of a 2D matrix to optimize space
std::vector<size_t> prev_row(b_len + 1, 0);
std::vector<size_t> curr_row(b_len + 1, 0);
// iterate through the elements of a
for (size_t i = 1; i <= a_len; i++) {
// iterate through the elements of b
for (size_t j = 1; j <= b_len; j++) {
// if elements at the current positions match
if (a[i - 1] == b[j - 1]) {
// if it's the first element of either sequences, set LCS length to 1
if (i == 1 || j == 1) {
curr_row[j] = 1;
} else {
// increment LCS length by 1 compared to the previous element
curr_row[j] = prev_row[j - 1] + 1;
}
// update max_length if necessary
if (curr_row[j] > max_length) {
max_length = curr_row[j];
}
} else {
// reset LCS length if elements don't match
curr_row[j] = 0;
}
}
// update the previous row for the next iteration
prev_row = curr_row;
}
// return the maximum length of the LCS
return max_length;
}
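// note: the reset-to-zero on mismatch above means this computes the longest
// *contiguous* common run of tokens (a longest-common-substring variant);
// a general longest common subsequence would carry lengths across mismatches.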
//
// Vocab utils
//
@@ -1760,65 +1863,3 @@ float lr_opt::get_lr(float epoch) const {
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
return r;
}
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) {
llama_batch batch = llama_batch_get_one(&last_token, 1);
batch.pos = &pos;
if (llama_decode(ctx, batch)) {
LOG_ERR("%s: failed to replay last token\n", __func__);
return false;
}
return true;
}
bool common_prompt_batch_decode(
struct llama_context * ctx,
const std::vector<llama_token> & tokens,
int & n_past,
int n_batch,
std::string_view state_path,
bool save_state) {
const int n_eval = tokens.size();
if (n_eval == 0) {
return true;
}
if (save_state && n_eval > 1) {
const int n_tokens_before_last = n_eval - 1;
GGML_ASSERT(n_eval <= n_batch);
// Decode all but the last token so we can save the memory state before decoding the last token.
// This is done so we can restore the session state later and replay the last token.
// Memory implementations in recurrent/hybrid models don't support removing tokens from their
// memory, so we can't just remove the last token from the memory and replay the last token which
// is the reason for this logic.
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_tokens_before_last;
llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);
llama_token last_token = tokens.back();
llama_batch batch = llama_batch_get_one(&last_token, 1);
int32_t pos = n_past;
batch.pos = &pos;
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval last token\n", __func__);
return false;
}
n_past++;
} else {
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
return true;
}
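A hedged sketch of the intended call pattern (the file name "session.bin" and the tokenization step are placeholders of mine): the first run snapshots the state just before the last token, and a later run can restore the file and replay only that token to regenerate logits:

```cpp
static bool run_prompt(llama_context * ctx, const std::string & prompt, int n_batch) {
    // first run: decode the prompt and save the pre-last-token state
    std::vector<llama_token> tokens = common_tokenize(ctx, prompt, /*add_special=*/true);
    int n_past = 0;
    if (!common_prompt_batch_decode(ctx, tokens, n_past, n_batch,
                                    "session.bin", /*save_state=*/true)) {
        return false;
    }
    // a later run, after llama_state_load_file(ctx, "session.bin", ...), only
    // needs: common_replay_last_token(ctx, tokens.back(), /*pos=*/n_past - 1);
    return true;
}
```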

View File

@@ -269,6 +269,7 @@ struct common_params_speculative {
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr<common_ngram_mod> ngram_mod;
@@ -410,8 +411,7 @@ struct common_params {
struct common_params_model model;
std::set<std::string> model_alias; // model aliases // NOLINT
std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
std::string model_alias = ""; // model alias // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
@@ -671,55 +671,30 @@ static std::vector<T> string_split(const std::string & str, char delim) {
}
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
std::vector<std::string> parts;
size_t begin_pos = 0;
size_t delim_pos = str.find(delim);
while (delim_pos != std::string::npos) {
std::string part = str.substr(begin_pos, delim_pos - begin_pos);
size_t separator_pos = input.find(separator);
while (separator_pos != std::string::npos) {
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
parts.emplace_back(part);
begin_pos = delim_pos + 1;
delim_pos = str.find(delim, begin_pos);
begin_pos = separator_pos + 1;
separator_pos = input.find(separator, begin_pos);
}
parts.emplace_back(str.substr(begin_pos));
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
return parts;
}
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
return str.size() >= prefix.size() &&
str.compare(0, prefix.size(), prefix) == 0;
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&
str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
if (string_ends_with(str, suffix)) {
str.resize(str.size() - suffix.size());
return true;
}
return false;
}
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
if (!str.empty() && !stop.empty()) {
const size_t max_len = std::min(str.size(), stop.size());
const char last_char = str.back();
for (size_t len = max_len; len > 0; --len) {
if (stop[len - 1] == last_char) {
if (string_ends_with(str, stop.substr(0, len))) {
return str.size() - len;
}
}
}
}
return std::string::npos;
}
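A usage sketch for the partial-stop helper: streaming code can hold back the tail of the generated text while it could still grow into a stop sequence (the strings here are my own examples):

```cpp
#include <cassert>
#include <string>
#include <string_view>

static void partial_stop_example() {
    const std::string_view stop = "</answer>";
    // the tail "</ans" matches a prefix of the stop string, so the last
    // 5 characters (starting at index 9) should be withheld from the client
    assert(string_find_partial_stop("some text</ans", stop) == 9);
    // no suffix of the text matches a prefix of the stop string
    assert(string_find_partial_stop("some text", stop) == std::string::npos);
}
```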
// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -805,22 +780,15 @@ void common_batch_add(
const std::vector<llama_seq_id> & seq_ids,
bool logits);
// decodes a single batch of tokens for a prompt and manages session tokens
//
// Note: We save state before the last token so that we can replay it to ensure
// compatibility with all memory types. Recurrent/hybrid models cannot remove
// tokens from memory, so this approach works across all model architectures.
bool common_prompt_batch_decode(
struct llama_context * ctx,
const std::vector<llama_token> & embd,
int & n_past,
int n_batch,
std::string_view state_path,
bool save_state);
// Token utils
//
// replays the last token after loading state to regenerate logits
// used after loading session state to ensure the sampling context has valid logits
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
// longest common prefix
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
// longest common subsequence
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
//
// Vocab utils
@@ -913,11 +881,11 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
inline std::string llm_ffn_exps_block_regex(int idx) {
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}

View File

@@ -19,7 +19,9 @@
#include <thread>
#include <vector>
#if defined(LLAMA_USE_HTTPLIB)
#include "http.h"
#endif
#ifndef __EMSCRIPTEN__
#ifdef __linux__
@@ -112,18 +114,44 @@ static void write_etag(const std::string & path, const std::string & etag) {
}
static std::string read_etag(const std::string & path) {
std::string none;
const std::string etag_path = path + ".etag";
if (!std::filesystem::exists(etag_path)) {
return {};
if (std::filesystem::exists(etag_path)) {
std::ifstream etag_in(etag_path);
if (!etag_in) {
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
return none;
}
std::string etag;
std::getline(etag_in, etag);
return etag;
}
std::ifstream etag_in(etag_path);
if (!etag_in) {
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
return {};
// no etag file, but maybe there is an old .json
// remove this code later
const std::string metadata_path = path + ".json";
if (std::filesystem::exists(metadata_path)) {
std::ifstream metadata_in(metadata_path);
try {
nlohmann::json metadata_json;
metadata_in >> metadata_json;
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
metadata_json.dump().c_str());
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
std::string etag = metadata_json.at("etag");
write_etag(path, etag);
if (!std::filesystem::remove(metadata_path)) {
LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
}
return etag;
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
}
}
std::string etag;
std::getline(etag_in, etag);
return etag;
return none;
}
static bool is_http_status_ok(int status) {
@@ -140,6 +168,8 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
return {hf_repo, tag};
}
#if defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
@@ -275,10 +305,7 @@ static bool common_pull_file(httplib::Client & cli,
);
if (!res) {
LOG_ERR("%s: download failed: %s (status: %d)\n",
__func__,
httplib::to_string(res.error()).c_str(),
res ? res->status : -1);
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
return false;
}
@@ -317,64 +344,62 @@ static int common_download_file_single_online(const std::string & url,
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
auto head = cli.Head(parts.path);
if (!head || head->status < 200 || head->status >= 300) {
LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
if (file_exists) {
LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
return head ? head->status : -1;
}
std::string etag;
if (head->has_header("ETag")) {
etag = head->get_header_value("ETag");
}
size_t total_size = 0;
if (head->has_header("Content-Length")) {
try {
total_size = std::stoull(head->get_header_value("Content-Length"));
} catch (const std::exception& e) {
LOG_WRN("%s: invalid Content-Length in HEAD response: %s\n", __func__, e.what());
}
}
bool supports_ranges = false;
if (head->has_header("Accept-Ranges")) {
supports_ranges = head->get_header_value("Accept-Ranges") != "none";
}
if (file_exists) {
if (etag.empty()) {
LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (!last_etag.empty() && last_etag == etag) {
LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
}
}
const std::string path_temporary = path + ".downloadInProgress";
int delay = retry_delay_seconds;
for (int i = 0; i < max_attempts; ++i) {
if (i) {
LOG_WRN("%s: retrying after %d seconds...\n", __func__, delay);
std::this_thread::sleep_for(std::chrono::seconds(delay));
delay *= retry_delay_seconds;
auto head = cli.Head(parts.path);
bool head_ok = head && head->status >= 200 && head->status < 300;
if (!head_ok) {
LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
if (file_exists) {
LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
return head->status; // cannot use cached file, return raw status code
// TODO: maybe retry only on certain codes
}
std::string etag;
if (head_ok && head->has_header("ETag")) {
etag = head->get_header_value("ETag");
}
size_t total_size = 0;
if (head_ok && head->has_header("Content-Length")) {
try {
total_size = std::stoull(head->get_header_value("Content-Length"));
} catch (const std::exception& e) {
LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
}
}
bool supports_ranges = false;
if (head_ok && head->has_header("Accept-Ranges")) {
supports_ranges = head->get_header_value("Accept-Ranges") != "none";
}
bool should_download_from_scratch = false;
if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
last_etag.c_str(), etag.c_str());
should_download_from_scratch = true;
}
if (file_exists) {
if (!should_download_from_scratch) {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
}
}
const std::string path_temporary = path + ".downloadInProgress";
size_t existing_size = 0;
if (std::filesystem::exists(path_temporary)) {
if (supports_ranges) {
if (supports_ranges && !should_download_from_scratch) {
existing_size = std::filesystem::file_size(path_temporary);
} else if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
@@ -382,23 +407,32 @@ static int common_download_file_single_online(const std::string & url,
}
}
LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(),
path_temporary.c_str(), etag.c_str());
if (common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size)) {
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
// start the download
LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
if (!was_pull_successful) {
if (i + 1 < max_attempts) {
const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
} else {
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
}
if (!etag.empty()) {
write_etag(path, etag);
}
return head->status;
continue;
}
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
}
if (!etag.empty()) {
write_etag(path, etag);
}
return head->status; // TODO: use actual GET status?
}
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
return -1; // max attempts reached
}
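The retry loop sleeps `pow(retry_delay_seconds, i) * 1000` milliseconds between attempts. A standalone sketch of that schedule, assuming `retry_delay_seconds == 2` and `max_attempts == 4` (values not taken from the patch), waits 1000, 2000, then 4000 ms:

```cpp
#include <chrono>
#include <cmath>
#include <cstdio>
#include <thread>

int main() {
    const int retry_delay_seconds = 2; // assumed value
    const int max_attempts = 4;        // assumed value
    for (int i = 0; i < max_attempts; ++i) {
        const bool pull_ok = false; // pretend every attempt fails
        if (!pull_ok && i + 1 < max_attempts) {
            const int delay_ms = std::pow(retry_delay_seconds, i) * 1000;
            std::printf("attempt %d failed, sleeping %d ms\n", i + 1, delay_ms);
            std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
        }
    }
    return 0;
}
```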
@@ -764,6 +798,30 @@ std::string common_docker_resolve_model(const std::string & docker) {
}
}
#else
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
std::string common_docker_resolve_model(const std::string &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
int common_download_file_single(const std::string &,
const std::string &,
const std::string &,
bool,
const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
#endif // defined(LLAMA_USE_HTTPLIB)
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();

View File

@@ -63,8 +63,7 @@ static void caps_print_stats(value & v, const std::string & path) {
std::map<std::string, bool> caps::to_map() const {
return {
{"supports_string_content", supports_string_content},
{"supports_typed_content", supports_typed_content},
{"requires_typed_content", requires_typed_content},
{"supports_tools", supports_tools},
{"supports_tool_calls", supports_tool_calls},
{"supports_parallel_tool_calls", supports_parallel_tool_calls},
@@ -90,7 +89,7 @@ caps caps_get(jinja::program & prog) {
return v->stats.ops.find(op_name) != v->stats.ops.end();
};
// case: typed content support
// case: typed content requirement
caps_try_execute(
prog,
[&]() {
@@ -106,16 +105,12 @@ caps caps_get(jinja::program & prog) {
// tools
return json{nullptr};
},
[&](bool success, value & messages, value &) {
[&](bool, value & messages, value &) {
auto & content = messages->at(0)->at("content");
caps_print_stats(content, "messages[0].content");
if (has_op(content, "selectattr") || has_op(content, "array_access")) {
// accessed as an array
result.supports_typed_content = true;
}
if (!success) {
// failed to execute with content as string
result.supports_string_content = false;
result.requires_typed_content = true;
}
}
);
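For intuition, a template along these lines (illustrative, not from the patch) would register `array_access` on `messages[0].content` during the probe, and would fail outright when the content is passed as a plain string:

```cpp
// Probing happens by executing the template once with string content;
// iterating the content as an array registers the ops that caps_get() inspects.
const char * tmpl =
    "{% for part in messages[0].content %}" // array_access on content
    "{{ part.text }}"
    "{% endfor %}";
```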

View File

@@ -14,9 +14,7 @@ struct caps {
bool supports_parallel_tool_calls = true;
bool supports_preserve_reasoning = false; // support assistant message with reasoning_content
// one of the 2 content capabilities must be true
bool supports_string_content = true;
bool supports_typed_content = false;
bool requires_typed_content = false; // default: use string content
// for reporting on server
std::map<std::string, bool> to_map() const;

View File

@@ -85,7 +85,7 @@ value identifier::execute_impl(context & ctx) {
auto builtins = global_builtins();
if (!it->is_undefined()) {
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(it);
it->stats.used = true;
}
JJ_DEBUG("Identifier '%s' found, type = %s", val.c_str(), it->type().c_str());
return it;
@@ -277,7 +277,7 @@ value binary_expression::execute_impl(context & ctx) {
static value try_builtin_func(context & ctx, const std::string & name, value & input, bool undef_on_missing = false) {
JJ_DEBUG("Trying built-in function '%s' for type %s", name.c_str(), input->type().c_str());
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(input);
input->stats.used = true;
input->stats.ops.insert(name);
}
auto builtins = input->get_builtins();
@@ -446,12 +446,6 @@ value for_statement::execute_impl(context & ctx) {
value iterable_val = iter_expr->execute(scope);
// mark the variable being iterated as used for stats
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.ops.insert("array_access");
}
if (iterable_val->is_undefined()) {
JJ_DEBUG("%s", "For loop iterable is undefined, skipping loop");
iterable_val = mk_val<value_array>();
@@ -470,7 +464,7 @@ value for_statement::execute_impl(context & ctx) {
items.push_back(std::move(tuple));
}
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.used = true;
iterable_val->stats.ops.insert("object_access");
}
} else {
@@ -480,7 +474,7 @@ value for_statement::execute_impl(context & ctx) {
items.push_back(item);
}
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.used = true;
iterable_val->stats.ops.insert("array_access");
}
}
@@ -721,8 +715,6 @@ value member_expression::execute_impl(context & ctx) {
int64_t arr_size = 0;
if (is_val<value_array>(object)) {
arr_size = object->as_array().size();
} else if (is_val<value_string>(object)) {
arr_size = object->as_string().length();
}
if (is_stmt<slice_expression>(this->property)) {
@@ -819,9 +811,8 @@ value member_expression::execute_impl(context & ctx) {
}
if (ctx.is_get_stats && val && object && property) {
value_t::stats_t::mark_used(val);
value_t::stats_t::mark_used(object);
value_t::stats_t::mark_used(property);
val->stats.used = true;
object->stats.used = true;
if (is_val<value_int>(property)) {
object->stats.ops.insert("array_access");
} else if (is_val<value_string>(property)) {

View File

@@ -4,7 +4,6 @@
// for converting from JSON to jinja values
#include <nlohmann/json.hpp>
#include <sstream>
#include <string>
#include <cctype>
#include <vector>
@@ -161,11 +160,6 @@ static value tojson(const func_args & args) {
value val_separators = args.get_kwarg_or_pos("separators", 3);
value val_sort = args.get_kwarg_or_pos("sort_keys", 4);
int indent = -1;
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
if (is_val<value_int>(val_indent)) {
indent = static_cast<int>(val_indent->as_int());
}
@@ -721,46 +715,8 @@ const func_builtins & value_string_t::get_builtins() const {
return args.get_pos(0);
}},
{"tojson", tojson},
{"indent", [](const func_args &args) -> value {
args.ensure_count(1, 4);
value val_input = args.get_pos(0);
value val_width = args.get_kwarg_or_pos("width", 1);
const bool first = args.get_kwarg_or_pos("first", 2)->as_bool(); // undefined == false
const bool blank = args.get_kwarg_or_pos("blank", 3)->as_bool(); // undefined == false
if (!is_val<value_string>(val_input)) {
throw raised_exception("indent() first argument must be a string");
}
std::string indent;
if (is_val<value_int>(val_width)) {
indent.assign(val_width->as_int(), ' ');
} else if (is_val<value_string>(val_width)) {
indent = val_width->as_string().str();
} else {
indent = " ";
}
std::string indented;
std::string input = val_input->as_string().str();
std::istringstream iss = std::istringstream(input);
std::string line;
while (std::getline(iss, line)) {
if (!indented.empty()) {
indented.push_back('\n');
}
if ((indented.empty() ? first : (!line.empty() || blank))) {
indented += indent;
}
indented += line;
}
if (!input.empty() && input.back() == '\n') {
indented.push_back('\n');
if (blank) {
indented += indent;
}
}
auto res = mk_val<value_string>(indented);
res->val_str.mark_input_based_on(val_input->as_string());
return res;
{"indent", [](const func_args &) -> value {
throw not_implemented_exception("String indent builtin not implemented");
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("String join builtin not implemented");
@@ -896,11 +852,6 @@ const func_builtins & value_array_t::get_builtins() const {
}},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_array>();
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"tojson", tojson},
@@ -1056,11 +1007,6 @@ const func_builtins & value_object_t::get_builtins() const {
{"tojson", tojson},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_object>();
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"length", [](const func_args & args) -> value {
@@ -1373,21 +1319,4 @@ std::string value_to_string_repr(const value & val) {
}
}
// stats utility
void value_t::stats_t::mark_used(value & val, bool deep) {
val->stats.used = true;
if (deep) {
if (is_val<value_array>(val)) {
for (auto & item : val->val_arr) {
mark_used(item, deep);
}
} else if (is_val<value_object>(val)) {
for (auto & pair : val->val_obj) {
mark_used(pair.first, deep);
mark_used(pair.second, deep);
}
}
}
}
} // namespace jinja

View File

@@ -118,8 +118,6 @@ struct value_t {
bool used = false;
// ops can be builtin calls or operators: "array_access", "object_access"
std::set<std::string> ops;
// utility to recursively mark value and its children as used
static void mark_used(value & val, bool deep = false);
} stats;
value_t() = default;

View File

@@ -231,9 +231,10 @@ void common_ngram_map_draft(common_ngram_map & map,
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
if (map.idx_last_check > cur_len) {
// Should not happen because of common_ngram_map_begin().
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
if (map.idx_last_check + map.check_rate > cur_len) {
return;
}
map.idx_last_check = cur_len;
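A worked example of the gate above: with `check_rate == 4` and `idx_last_check == 10`, calls at `cur_len` 11 through 13 return early, and the next real lookup happens once `cur_len` reaches 14. A standalone restatement of the condition:

```cpp
#include <cstddef>

// true when (cur_len - idx_last_check) >= check_rate, i.e. the early-return
// condition above does not fire
static bool should_check(size_t idx_last_check, size_t check_rate, size_t cur_len) {
    return idx_last_check + check_rate <= cur_len;
}
```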
@@ -461,7 +462,7 @@ void common_ngram_map_draft(common_ngram_map & map,
slot_max = v;
}
}
// What is sum of the other occurrences?
// What is sum of the other occurrences?
uint32_t sum_occur = 0;
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
if (v == slot_max) {

View File

@@ -24,6 +24,7 @@
struct common_ngram_simple_config {
uint16_t size_ngram; // size of n-grams to lookup in self-mode
uint16_t size_mgram; // size of m-grams to draft in self-mode
uint16_t check_rate; // check for speculative decoding without a draft model once every check_rate tokens
};
// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
@@ -44,7 +45,7 @@ llama_tokens common_ngram_simple_draft(
// statistics of a m-gram after a known n-gram
struct common_ngram_map_value {
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
uint16_t value_num = 0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
uint16_t value_num = 0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
};
@@ -53,7 +54,7 @@ struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history
size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history
uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
};
@@ -65,14 +66,15 @@ struct common_ngram_map {
bool key_only; // true if only key n-grams are used, no values.
std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
uint16_t check_rate; // check for speculative decoding without a draft model once every check_rate tokens
uint16_t min_hits; // minimum number of key hits to consider a draft
bool show_key_map_stats = false; // true, if statistics of the key_map should be printed.
bool show_key_map_stats = false; // true, if statistics of the key_map should be printed.
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
uint16_t min_hits)
uint16_t check_rate, uint16_t min_hits)
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
min_hits(min_hits) {
check_rate(check_rate), min_hits(min_hits) {
key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
}
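A construction sketch mirroring `get_common_ngram_map()` below (the numeric values are the defaults from `common_params_speculative` shown earlier in this diff):

```cpp
common_ngram_map map(
    /*sz_key    =*/ 12,    // ngram_size_n
    /*sz_value  =*/ 48,    // ngram_size_m
    /*only_keys =*/ false, // NGRAM_MAP rather than NGRAM_MAP_K
    /*check_rate=*/ 1,     // ngram_check_rate
    /*min_hits  =*/ 1);    // ngram_min_hits
```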

View File

@@ -113,14 +113,13 @@ static bool common_speculative_are_compatible(
struct common_speculative_state {
const enum common_speculative_type type;
size_t n_call_begin = 0; // number of times this implementation was called for refresh.
size_t n_call_draft = 0; // number of times this implementation was called for generation.
size_t n_call_accept = 0; // number of times this implementation was called for accumulation.
size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation.
size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model.
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
// TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
// TODO: add n_call_begin, n_call_accept
size_t drafts_call_count = 0; // number of times this implementation was called.
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
size_t drafts_generated_tokens = 0; // number of tokens generated by this implementation.
size_t drafts_accepted_tokens = 0; // number of tokens accepted by the target model.
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
@@ -466,6 +465,8 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
struct common_speculative_state_ngram_simple : public common_speculative_state {
common_ngram_simple_config config;
uint16_t check_id = 0; // used to control the frequency of generating drafts
common_speculative_state_ngram_simple(
enum common_speculative_type type,
common_ngram_simple_config config)
@@ -480,6 +481,11 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
++check_id;
if (check_id < config.check_rate) {
return;
}
check_id = 0;
result = common_ngram_simple_draft(config, prompt_tgt, id_last);
GGML_UNUSED(params);
@@ -746,9 +752,10 @@ static common_ngram_map get_common_ngram_map(const common_speculative_config & c
uint16_t size_key = config.params.ngram_size_n;
uint16_t size_value = config.params.ngram_size_m;
bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
uint16_t check_rate = config.params.ngram_check_rate;
uint16_t min_hits = config.params.ngram_min_hits;
return common_ngram_map(size_key, size_value, key_only, min_hits);
return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits);
}
static common_speculative_state_ngram_cache create_state_ngram_cache(
@@ -798,42 +805,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}
bool common_speculative_is_compat(llama_context * ctx_tgt) {
auto * mem = llama_get_memory(ctx_tgt);
if (mem == nullptr) {
return false;
}
bool res = true;
llama_memory_clear(mem, true);
// eval 2 tokens to check if the context is compatible
std::vector<llama_token> tmp;
tmp.push_back(0);
tmp.push_back(0);
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = false;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = false;
goto done;
}
done:
llama_memory_clear(mem, true);
llama_synchronize(ctx_tgt);
return res;
}
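A hedged usage sketch (the `params.speculative` field name is assumed from `common_params`): probe the target context before enabling speculative decoding, keeping in mind that the probe clears the context's memory:

```cpp
common_speculative * spec = nullptr;
if (common_speculative_is_compat(ctx_tgt)) {
    // note: the probe above cleared the memory of ctx_tgt
    spec = common_speculative_init(params.speculative, ctx_tgt);
}
```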
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
@@ -924,10 +895,12 @@ common_speculative * common_speculative_init(
uint16_t ngram_size_key = ngram_map.size_key;
uint16_t mgram_size_value = ngram_map.size_value;
uint16_t check_rate = ngram_map.check_rate;
auto config_simple = common_ngram_simple_config {
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value
/* .size_mgram = */ mgram_size_value,
/* .check_rate = */ check_rate
};
auto state = std::make_unique<common_speculative_state_ngram_simple>(
/* .type = */ config.type,
@@ -988,7 +961,6 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
for (auto & impl : spec->impls) {
common_time_meas tm(impl->t_begin_us, !impl->gen_perf);
impl->begin(prompt);
impl->n_call_begin++;
}
}
@@ -1005,17 +977,17 @@ llama_tokens common_speculative_draft(
{
common_time_meas tm(impl->t_draft_us, !impl->gen_perf);
impl->draft(params, prompt_tgt, id_last, result);
impl->n_call_draft++;
impl->drafts_call_count++;
}
if (!result.empty()) {
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
impl.get()->n_call_draft, result.size());
impl.get()->drafts_call_count, result.size());
spec->curr_impl = impl.get(); // set current implementation for stats
impl->n_gen_drafts++;
impl->n_gen_tokens += result.size();
impl->drafts_generated_count++;
impl->drafts_generated_tokens += result.size();
break; // We have a draft, so break out of the loop and return it.
}
@@ -1036,12 +1008,11 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (n_accepted > 0) {
impl->n_acc_drafts++;
impl->n_acc_tokens += n_accepted;
impl->drafts_accepted_count++;
impl->drafts_accepted_tokens += n_accepted;
}
impl->accept(n_accepted);
impl->n_call_accept++;
}
}
@@ -1062,13 +1033,13 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}
LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
impl->n_gen_drafts,
impl->n_acc_drafts,
impl->n_gen_tokens,
impl->n_acc_tokens,
impl->drafts_call_count,
impl->drafts_generated_count,
impl->drafts_accepted_count,
impl->drafts_generated_tokens,
impl->drafts_accepted_tokens,
str_perf.c_str());
}
}

View File

@@ -14,10 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
// check if the llama_context is compatible for speculative decoding
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative * common_speculative_init(
common_params_speculative & params,
llama_context * ctx_tgt);

File diff suppressed because it is too large

View File

@@ -99,7 +99,6 @@ models = [
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -107,7 +106,6 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
@@ -115,7 +113,6 @@ models = [
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "jais-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inceptionai/Jais-2-8B-Chat", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
@@ -151,9 +148,6 @@ models = [
{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
{"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -163,7 +157,6 @@ pre_computed_hashes = [
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
@@ -177,6 +170,7 @@ pre_computed_hashes = [
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
# jina-v2-de variants
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
]

View File

@@ -246,7 +246,7 @@ cmake --build build --config release
1. **Retrieve and prepare model**
You can refer to the general [*Obtaining and quantizing models*](../../README.md#obtaining-and-quantizing-models) guide for model preparation.
You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model preparation.
**Notes**:

View File

@@ -281,7 +281,7 @@ as `-cl-fp32-correctly-rounded-divide-sqrt`
#### Retrieve and prepare model
You can refer to the general [*Obtaining and quantizing models*](../../README.md#obtaining-and-quantizing-models) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
##### Check device
@@ -569,7 +569,7 @@ Once it is completed, final results will be in **build/Release/bin**
#### Retrieve and prepare model
You can refer to the general [*Obtaining and quantizing models*](../../README.md#obtaining-and-quantizing-models) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
##### Check device

View File

@@ -1,182 +0,0 @@
# GGML-VirtGPU Backend
The GGML-VirtGPU backend enables GGML applications to run machine
learning computations on host hardware while the application itself
runs inside a virtual machine. It uses host-guest shared memory to
efficiently share data buffers between the two sides.
This backend relies on the virtio-gpu device and the VirglRenderer API Remoting (APIR) component. The backend is split into two libraries:
- a GGML implementation (the "remoting frontend"), running in the
guest and interacting with the virtgpu device
- a VirglRenderer APIR compatible library (the "remoting backend"),
running in the host and interacting with Virglrenderer and an actual
GGML device backend.
## OS support
| OS | Status | Backend | CI testing | Notes
| -------- | ----------------- | ----------- | ----------- | -----
| MacOS 14 | Supported | ggml-metal | X | Working when compiled on MacOS 14
| MacOS 15 | Supported | ggml-metal | X | Working when compiled on MacOS 14 or MacOS 15
| MacOS 26 | Not tested | | |
| Linux | Under development | ggml-vulkan | not working | Working locally, CI running into deadlocks
## Architecture Overview
The GGML-VirtGPU backend consists of three main components:
```mermaid
graph TD
%% Nodes
subgraph GuestVM ["Guest VM - Frontend"]
App([GGML Application<br/>llama.cpp, etc.])
direction TB
Interface[GGML Backend Interface]
Comm["GGML-VirtGPU<br/>(hypercalls + shared mem)"]
App --> Interface
Interface --> Comm
end
API[virtio-gpu / virglrenderer API]
subgraph HostSystem [Host System - Backend]
direction TB
Dispatcher[GGML-VirtGPU-Backend]
BackendLib[GGML Backend library<br/>Metal / Vulkan / CPU / ...]
Dispatcher --> BackendLib
end
%% Connections
Comm --> API
API --> HostSystem
```
### Key Components
1. **Guest-side Frontend** (`ggml-virtgpu/`): Implements the GGML backend interface and forwards operations to the host
2. **Host-side Backend** (`ggml-virtgpu/backend/`): Receives forwarded operations and executes them on actual hardware backends
3. **Communication Layer**: Uses virtio-gpu hypercalls and shared memory for efficient data transfer
## Features
- **Dynamic backend loading** on the host side (CPU, CUDA, Metal, etc.)
- **Zero-copy data transfer** via host-guest shared memory pages
## Communication Protocol
### Hypercalls and Shared Memory
The backend uses two primary communication mechanisms:
1. **Hypercalls (`DRM_IOCTL_VIRTGPU_EXECBUFFER`)**: Trigger remote execution from guest to host
2. **Shared Memory Pages**: Zero-copy data transfer for tensors and parameters
#### Shared Memory Layout
Each connection uses two shared memory buffers:
- **Data Buffer** (24 MiB): For command/response data and tensor transfers
- **Reply Buffer** (16 KiB): For command replies and status information
- **Data Buffers**: dynamically allocated host-guest shared buffers that serve as GGML buffers.
### APIR Protocol
The Virglrender API Remoting protocol defines three command types:
- `HANDSHAKE`: Protocol version negotiation and capability discovery
- `LOADLIBRARY`: Dynamic loading of backend libraries on the host
- `FORWARD`: API function call forwarding
### Binary Serialization
Commands and data are serialized using a custom binary protocol with:
- Fixed-size encoding for basic types
- Variable-length arrays with size prefixes
- Buffer bounds checking
- Error recovery mechanisms
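A minimal sketch of this encoding style (the names and layout are illustrative, not the actual APIR wire format): fixed-size integers are copied verbatim, and variable-length arrays carry a size prefix:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

static void put_u32(std::vector<uint8_t> & buf, uint32_t v) {
    const size_t off = buf.size();
    buf.resize(off + sizeof(v));
    std::memcpy(buf.data() + off, &v, sizeof(v)); // fixed-size encoding
}

static void put_bytes(std::vector<uint8_t> & buf, const void * p, uint32_t n) {
    put_u32(buf, n); // size prefix for the variable-length payload
    const size_t off = buf.size();
    buf.resize(off + n);
    std::memcpy(buf.data() + off, p, n);
}
```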
## Supported Operations
### Device Operations
- Device enumeration and capability queries
- Memory information (total/free)
- Backend type detection
### Buffer Operations
- Buffer allocation and deallocation
- Tensor data transfer (host ↔ guest)
- Memory copying and clearing
### Computation Operations
- Graph execution forwarding
## Build Requirements
### Guest-side Dependencies
- `libdrm` for DRM/virtio-gpu communication
- C++20 compatible compiler
- CMake 3.14+
### Host-side Dependencies
- virglrenderer with APIR support (pending upstream review)
- Target backend libraries (libggml-metal, libggml-vulkan, etc.)
## Configuration
### Environment Variables
- `GGML_VIRTGPU_BACKEND_LIBRARY`: Path to the host-side backend library
- `GGML_VIRTGPU_DEBUG`: Enable debug logging
### Build Options
- `GGML_VIRTGPU`: Enable the VirtGPU backend (`ON` or `OFF`, default: `OFF`)
- `GGML_VIRTGPU_BACKEND`: Build the host-side backend component (`ON`, `OFF` or `ONLY`, default: `OFF`)
### System Requirements
- VM with virtio-gpu support
- VirglRenderer with APIR patches
- Compatible backend libraries on host
## Limitations
- **VM-specific**: Only works in virtual machines with virtio-gpu support
- **Host dependency**: Requires properly configured host-side backend
- **Latency**: Small overhead from VM escaping for each operation
- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM
addressable memory is limited to 64 GB. So the maximum GPU memory
will be `64GB - RAM`, regardless of the hardware VRAM size.
- This work is pending upstream changes in the VirglRenderer
project.
- The backend can be tested with Virglrenderer compiled from source
using this PR:
https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590
- This work is pending changes in the VMM/hypervisor running the
virtual machine, which needs to know how to route the newly
introduced APIR capset.
- The environment variable `VIRGL_ROUTE_VENUS_TO_APIR=1` allows
using the Venus capset until the relevant hypervisors have been
patched. However, setting this flag breaks normal Vulkan/Venus
behavior.
- The environment variable `GGML_REMOTING_USE_APIR_CAPSET` tells the
`ggml-virtgpu` backend to use the APIR capset. This will become
the default once the relevant hypervisors have been patched.
- This work focused on improving the performance of llama.cpp running
in MacOS containers, and is mainly tested on this platform. Linux
support (via `krun`) is in progress.
## See Also
- [Development and Testing](VirtGPU/development.md)
- [Backend configuration](VirtGPU/configuration.md)

View File

@@ -1,174 +0,0 @@
# GGML-VirtGPU Backend Configuration
This document describes the environment variables used by the ggml-virtgpu backend system, covering both the frontend (guest-side) and backend (host-side) components.
## Environment Variables Overview
The ggml-virtgpu backend uses environment variables for configuration across three main components:
- **Frontend (Guest)**: GGML applications running in VMs
- **Hypervisor**: Virglrenderer/APIR system
- **Backend (Host)**: Host-side GGML backend integration
## Frontend (Guest-side) Configuration
### GGML_REMOTING_USE_APIR_CAPSET
- **Location**: `ggml/src/ggml-virtgpu/virtgpu.cpp`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Controls which virtio-gpu capability set to use for communication
- **Values**:
- Set (any value): Use the APIR capset (long-term setup)
- Unset: Use the Venus capset (easier for testing with an unmodified hypervisor)
- **Default**: Unset (Venus capset)
- **Usage**:
```bash
export GGML_REMOTING_USE_APIR_CAPSET=1 # Use APIR capset
# or leave unset for Venus capset
```
## Hypervisor (Virglrenderer/APIR) Configuration
These environment variables are used during the transition phase for
running with an unmodified hypervisor (not supporting the
VirglRenderer APIR component). They will be removed in the future, and
the hypervisor will instead configure VirglRenderer with the APIR
_Configuration Key_.
### VIRGL_APIR_BACKEND_LIBRARY
- **Location**: `virglrenderer/src/apir/apir-context.c`
- **Configuration Key**: `apir.load_library.path`
- **Type**: File path string
- **Purpose**: Path to the APIR backend library that virglrenderer should dynamically load
- **Required**: Yes
- **Example**:
```bash
export VIRGL_APIR_BACKEND_LIBRARY="/path/to/libggml-remotingbackend.so"
```
### VIRGL_ROUTE_VENUS_TO_APIR
- **Location**: `virglrenderer/src/apir/apir-renderer.h`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Temporary workaround to route Venus capset calls to APIR during hypervisor transition period
- **Status**: will be removed once hypervisors support APIR natively
- **Warning**: Breaks normal Vulkan/Venus functionality
- **Usage**:
```bash
export VIRGL_ROUTE_VENUS_TO_APIR=1 # For testing with an unmodified hypervisor
```
### VIRGL_APIR_LOG_TO_FILE
- **Location**: `virglrenderer/src/apir/apir-renderer.c`
- **Environment Variable**: `VIRGL_APIR_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the VirglRenderer APIR component to specified file
- **Required**: No (optional debugging)
- **Default**: Logging to `stderr`
- **Usage**:
```bash
export VIRGL_APIR_LOG_TO_FILE="/tmp/apir-debug.log"
```
## Backend (Host-side) Configuration
These environment variables are used during the transition phase for
running with an unmodified hypervisor (not supporting the
VirglRenderer APIR component). They will be removed in the future, and
the hypervisor will instead configure VirglRenderer with the APIR
_Configuration Key_.
### APIR_LLAMA_CPP_GGML_LIBRARY_PATH
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
- **Configuration Key**: `ggml.library.path`
- **Type**: File path string
- **Purpose**: Path to the actual GGML backend library (Metal, CUDA, Vulkan, etc.)
- **Required**: **Yes** - backend initialization fails without this
- **Examples**:
```bash
# macOS with Metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"
# Linux with CUDA backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-cuda.so"
# macOS or Linux with Vulkan backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-vulkan.so"
```
### APIR_LLAMA_CPP_GGML_LIBRARY_REG
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
- **Configuration Key**: `ggml.library.reg`
- **Type**: Function symbol name string
- **Purpose**: Name of the backend registration function to call after loading the library
- **Required**: No (defaults to `ggml_backend_init`)
- **Default**: `ggml_backend_init`
- **Examples**:
```bash
# Metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"
# CUDA backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_cuda_reg"
# Vulkan backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_vulkan_reg"
# Generic fallback (default)
# export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_init"
```
### APIR_LLAMA_CPP_LOG_TO_FILE
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp:62`
- **Environment Variable**: `APIR_LLAMA_CPP_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the GGML backend to specified file
- **Required**: No (optional debugging)
- **Usage**:
```bash
export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml-backend-debug.log"
```
## Configuration Flow
The configuration system works as follows:
1. **Hypervisor Setup**: Virglrenderer loads the APIR backend library specified by `VIRGL_APIR_BACKEND_LIBRARY`
2. **Context Creation**: When an APIR context is created, it populates a configuration table with environment variables:
- `apir.load_library.path` ← `VIRGL_APIR_BACKEND_LIBRARY`
- `ggml.library.path` ← `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
- `ggml.library.reg` ← `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
- this step will eventually be performed by the hypervisor itself, with command-line arguments instead of environment variables.
3. **Backend Initialization**: The backend queries the configuration via callbacks:
- `virgl_cbs->get_config(ctx_id, "ggml.library.path")` returns the library path
- `virgl_cbs->get_config(ctx_id, "ggml.library.reg")` returns the registration function
4. **Library Loading**: The backend dynamically loads and initializes the specified GGML library
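A sketch of steps 3 and 4 under stated assumptions: the `get_config` callback and the configuration keys are the ones named in this document, while the callback struct shape and the `dlopen`/`dlsym` plumbing are illustrative:

```cpp
#include <dlfcn.h>
#include <cstdint>

// assumed callback shape; the real struct lives in virglrenderer
struct apir_callbacks {
    const char * (*get_config)(uint32_t ctx_id, const char * key);
};

static void * load_ggml_backend(const apir_callbacks * cbs, uint32_t ctx_id) {
    const char * path = cbs->get_config(ctx_id, "ggml.library.path");
    const char * reg  = cbs->get_config(ctx_id, "ggml.library.reg");
    void * handle = path ? dlopen(path, RTLD_NOW) : nullptr;
    if (!handle) {
        return nullptr; // maps to the "cannot open the GGML library" error below
    }
    // resolve and call the registration entry point, e.g. ggml_backend_metal_reg
    using reg_fn_t = void * (*)();
    auto reg_fn = reinterpret_cast<reg_fn_t>(
        dlsym(handle, reg ? reg : "ggml_backend_init")); // default per this doc
    return reg_fn ? reg_fn() : nullptr;
}
```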
## Error Messages
Common error scenarios and their messages:
- **Missing library path**: `"cannot open the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_PATH' not defined"`
- **Missing registration function**: `"cannot register the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_REG' not defined"`
## Example Complete Configuration
Here's an example configuration for a macOS host with Metal backend:
```bash
# Hypervisor environment
export VIRGL_APIR_BACKEND_LIBRARY="/opt/llama.cpp/lib/libggml-virtgpu-backend.dylib"
# Backend configuration
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"
# Optional logging
export VIRGL_APIR_LOG_TO_FILE="/tmp/apir.log"
export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml.log"
# Guest configuration
export GGML_REMOTING_USE_APIR_CAPSET=1
```

View File

@@ -1,220 +0,0 @@
# Development and Testing
## Development
### Code Generation
The backend uses code generation from YAML configuration:
```bash
# Regenerate protocol code
cd ggml-virtgpu/
python regenerate_remoting.py
```
### Adding New Operations
1. Add function definition to `ggmlremoting_functions.yaml`
2. Regenerate code with `regenerate_remoting.py`
3. Implement guest-side forwarding in `virtgpu-forward-*.cpp`
4. Implement host-side handling in `backend-dispatched-*.cpp`
## Testing
This document provides instructions for building and testing the GGML-VirtGPU backend on macOS with containers.
### Prerequisites
The testing setup requires:
- macOS host system
- Container runtime with `libkrun` provider (podman machine)
- Access to development patchset for VirglRenderer
### Required Patchsets
The backend requires patches that are currently under review:
- **Virglrenderer APIR upstream PR**: https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 (for reference)
- **MacOS Virglrenderer (for krunkit)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-macos
- **Linux Virglrenderer (for krun)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-linux
### Build Instructions
#### 1. Build ggml-virtgpu-backend (Host-side, macOS)
```bash
# Build the backend that runs natively on macOS
mkdir llama.cpp
cd llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src
LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend
cmake -S . -B $LLAMA_MAC_BUILD \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=ON \
-DGGML_VIRTGPU_BACKEND=ONLY \
-DGGML_METAL=ON
TARGETS="ggml-metal"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $TARGETS
# Build additional tools for native benchmarking
EXTRA_TARGETS="llama-run llama-bench"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
```
#### 2. Build virglrenderer (Host-side, macOS)
```bash
# Build virglrenderer with APIR support
mkdir virglrenderer
git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
cd src
VIRGL_BUILD_DIR=$PWD/build
# -Dvenus=true and VIRGL_ROUTE_VENUS_TO_APIR=1 route the APIR requests via the Venus backend, for easier testing without a patched hypervisor
meson setup $VIRGL_BUILD_DIR \
-Dvenus=true \
-Dapir=true
ninja -C $VIRGL_BUILD_DIR
```
#### 3. Build ggml-virtgpu (Guest-side, Linux)
Option A: Build from source:
```bash
# Inside a Linux container
mkdir llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src
LLAMA_LINUX_BUILD=$PWD/build-virtgpu
cmake -S . -B $LLAMA_LINUX_BUILD \
-DGGML_VIRTGPU=ON
cmake --build $LLAMA_LINUX_BUILD --parallel
```
Option B: Build container image with frontend:
```bash
cat << EOF > remoting.containerfile
FROM quay.io/fedora/fedora:43
USER 0
WORKDIR /app/remoting
ARG LLAMA_CPP_REPO="https://github.com/ggml-org/llama.cpp.git"
ARG LLAMA_CPP_VERSION="master"
ARG LLAMA_CPP_CMAKE_FLAGS="-DGGML_VIRTGPU=ON"
ARG LLAMA_CPP_CMAKE_BUILD_FLAGS="--parallel 4"
RUN dnf install -y git cmake gcc gcc-c++ libcurl-devel libdrm-devel
RUN git clone "\${LLAMA_CPP_REPO}" src \\
&& git -C src fetch origin \${LLAMA_CPP_VERSION} \\
&& git -C src reset --hard FETCH_HEAD
RUN mkdir -p build \\
&& cd src \\
&& set -o pipefail \\
&& cmake -S . -B ../build \${LLAMA_CPP_CMAKE_FLAGS} \\
&& cmake --build ../build/ \${LLAMA_CPP_CMAKE_BUILD_FLAGS}
ENTRYPOINT ["/app/remoting/src/build/bin/llama-server"]
EOF
mkdir -p empty_dir
podman build -f remoting.containerfile ./empty_dir -t localhost/llama-cpp.virtgpu
```
### Environment Setup
#### Set krunkit Environment Variables
```bash
# Define the base directories (adapt these paths to your system)
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build
LLAMA_MAC_BUILD=$HOME/remoting/llama.cpp/build-backend
# For krunkit to load the custom virglrenderer library
export DYLD_LIBRARY_PATH=$VIRGL_BUILD_DIR/src
# For Virglrenderer to load the ggml-remotingbackend library
export VIRGL_APIR_BACKEND_LIBRARY="$LLAMA_MAC_BUILD/bin/libggml-virtgpu-backend.dylib"
# For llama.cpp remotingbackend to load the ggml-metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="$LLAMA_MAC_BUILD/bin/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG=ggml_backend_metal_reg
```
#### Launch Container Environment
```bash
# Set container provider to libkrun
export CONTAINERS_MACHINE_PROVIDER=libkrun
podman machine start
```
#### Verify Environment
Confirm that krunkit is using the correct virglrenderer library:
```bash
lsof -c krunkit | grep virglrenderer
# Expected output:
# krunkit 50574 user txt REG 1,14 2273912 10849442 ($VIRGL_BUILD_DIR/src)/libvirglrenderer.1.dylib
```
### Running Tests
#### Launch Test Container
```bash
# Optional model caching
mkdir -p models
PODMAN_CACHE_ARGS="-v models:/models --user root:root --cgroupns host --security-opt label=disable -w /models"
podman run $PODMAN_CACHE_ARGS -it --rm --device /dev/dri --entrypoint bash localhost/llama-cpp.virtgpu
```
#### Test llama.cpp in Container
```bash
# Run performance benchmark
/app/remoting/build/bin/llama-bench -m ./llama3.2
```
Expected output (performance may vary):
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | -------------------: |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | pp512 | 991.30 ± 0.66 |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | tg128 | 85.71 ± 0.11 |
```
### Troubleshooting
#### SSH Environment Variable Issues
⚠️ **Warning**: Setting `DYLD_LIBRARY_PATH` from an SSH session doesn't work on macOS, because the system strips `DYLD_*` variables from processes spawned by SIP-protected binaries such as `sshd`. Here is a workaround:
**Workaround 1: Replace system library**
```bash
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build # ⚠️ adapt to your system
BREW_VIRGL_DIR=/opt/homebrew/Cellar/virglrenderer/0.10.4d/lib
VIRGL_LIB=libvirglrenderer.1.dylib
cd $BREW_VIRGL_DIR
mv $VIRGL_LIB ${VIRGL_LIB}.orig
ln -s $VIRGL_BUILD_DIR/src/$VIRGL_LIB
```

View File

@@ -22,7 +22,7 @@
**Llama.cpp + ZenDNN**
The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL DLP, LibXSMM, OneDNN).
The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN).
For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html
@@ -32,7 +32,7 @@ For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendn
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 20.04, 22.04, 24.04 |
For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md#15-supported-os).
For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os).
## Hardware
@@ -44,9 +44,9 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based
| CPU Family | Status | Notes |
|:-----------------------------:|:-------:|:----------------------------------:|
| AMD EPYC™ 9005 Series (Turin) | Support | 5th Gen - Zen 5 architecture |
| AMD EPYC™ 9004 Series (Genoa) | Support | 4th Gen - Zen 4 architecture |
| AMD EPYC™ 7003 Series (Milan) | Support | 3rd Gen - Zen 3 architecture |
| AMD EPYC™ 9005 Series (Turin)| Support | 5th Gen - Zen 5 architecture |
| AMD EPYC™ 9004 Series (Genoa)| Support | 4th Gen - Zen 4 architecture |
| AMD EPYC™ 7003 Series (Milan)| Support | 3rd Gen - Zen 3 architecture |
| AMD Ryzen™ AI MAX (Strix Halo)| Support | High-performance mobile processors |
*Notes:*
@@ -61,7 +61,7 @@ The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** ope
| Operation | Status | Notes |
|:-------------|:-------:|:----------------------------------------------:|
| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul |
| MUL_MAT | ✅ | Accelerated via ZenDNN LowOHA MatMul |
*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
@@ -104,6 +104,7 @@ If you want to build ZenDNN yourself or use a specific version:
# Clone ZenDNN repository
git clone https://github.com/amd/ZenDNN.git
cd ZenDNN
git checkout zendnnl
# Build and install (requires CMake >= 3.25)
mkdir build && cd build
@@ -113,7 +114,7 @@ cmake --build . --target all
Default installation path: `ZenDNN/build/install`
**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md).
**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md).
**Step 2: Build llama.cpp with custom ZenDNN path**
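A minimal sketch of such a build, assuming the default install path from Step 1 (adjust `ZENDNN_ROOT` to your installation):
```sh
cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=$PWD/ZenDNN/build/install -DGGML_OPENMP=ON
cmake --build build --config Release -j
```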
@@ -145,7 +146,8 @@ Run llama.cpp server with ZenDNN acceleration:
```sh
# Set optimal configuration
export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo for best performance
export OMP_NUM_THREADS=64 # Adjust to your CPU core count
export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance
# Start server
./build/bin/llama-server \
@@ -158,26 +160,62 @@ export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo for best performance
Access the server at `http://localhost:8080`.
**Performance tips**:
- Use `ZENDNNL_MATMUL_ALGO=1` for optimal performance
- Set `OMP_NUM_THREADS` to match your physical core count
- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance
- For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...`
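For example, the tips above combine as follows (model path and thread count are illustrative):
```sh
export ZENDNNL_MATMUL_ALGO=2   # Blocked AOCL BLIS
export OMP_NUM_THREADS=64      # match your physical core count
numactl --cpunodebind=0 --membind=0 \
    ./build/bin/llama-server -m models/llama-2-7b.Q4_0.gguf --port 8080
```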
## Environment Variables
For environment variables related to ZenDNN, refer to the [ZenDNN Environment Variables Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md).
### Build Time
### Performance Optimization
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_ZENDNN | ON/OFF | Enable ZenDNN backend support |
| ZENDNN_ROOT | Path to ZenDNN installation | Set ZenDNN installation directory |
| GGML_OPENMP | ON/OFF (recommended: ON) | Enable OpenMP for multi-threading |
ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL DLP** algorithm:
### Runtime
| Name | Value | Function |
|-------------------------|--------------------------|-------------------------------------------------------------------|
| OMP_NUM_THREADS | Number (e.g., 64) | Set number of OpenMP threads (recommended: physical core count) |
| ZENDNNL_MATMUL_ALGO | 0-5 | Select MatMul backend algorithm (see Performance Optimization) |
| ZENDNNL_PROFILE_LOG_LEVEL | 0-4 | Profiling log level (0=disabled, 4=verbose) |
| ZENDNNL_ENABLE_PROFILER | 0 or 1 | Enable detailed profiling (1=enabled) |
| ZENDNNL_API_LOG_LEVEL | 0-4 | API log level (0=disabled, 4=verbose) |
**Example**:
```sh
export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended)
export OMP_NUM_THREADS=64
export ZENDNNL_MATMUL_ALGO=2 # Use Blocked AOCL BLIS for best performance
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100
```
For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details).
## Performance Optimization
### MatMul Algorithm Selection
ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm:
```sh
export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS (recommended)
```
**Available algorithms**:
| Value | Algorithm | Description |
|:-----:|:-----------------------|:----------------------------------------------|
| 0 | Dynamic Dispatch | Automatic backend selection (default) |
| 1 | AOCL BLIS | AOCL BLIS backend |
| 2 | AOCL BLIS Blocked | **Blocked AOCL BLIS (recommended)** |
| 3 | OneDNN | OneDNN backend |
| 4 | OneDNN Blocked | Blocked OneDNN |
| 5 | LibXSMM | LibXSMM backend |
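To check which algorithm is fastest on a given machine, a quick sweep with `llama-bench` is one option (model path illustrative):
```sh
for algo in 0 1 2 3 4 5; do
    echo "=== ZENDNNL_MATMUL_ALGO=$algo ==="
    ZENDNNL_MATMUL_ALGO=$algo ./build/bin/llama-bench -m models/llama-2-7b.Q4_0.gguf
done
```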
### Profiling and Debugging
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md).
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md).
## Known Issues
@@ -207,9 +245,10 @@ A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized mode
A: Ensure:
1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer)
2. `ZENDNNL_MATMUL_ALGO=1` is set for best performance (Blocked AOCL DLP)
3. You're using a sufficiently large model (small models may not benefit as much)
4. Enable profiling to verify ZenDNN MatMul is being called
2. `OMP_NUM_THREADS` is set appropriately (physical core count)
3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS)
4. You're using a sufficiently large model (small models may not benefit as much)
5. Enable profiling to verify ZenDNN MatMul is being called (see the sketch below)
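A minimal sketch of that check, using the runtime variables from the table above (model path illustrative):
```sh
export ZENDNNL_ENABLE_PROFILER=1      # enable detailed profiling
export ZENDNNL_PROFILE_LOG_LEVEL=3    # reasonably verbose profiling log
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 16
```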
### **GitHub Contribution**:
Please add the **[ZenDNN]** prefix/tag to issue/PR titles to help the ZenDNN team check and address them without delay.

View File

@@ -35,7 +35,7 @@ Adapt below build commands accordingly.
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
```
[d]/workspace> cp docs/backend/snapdragon/CMakeUserPresets.json .
[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .
[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
Preset CMake variables:

View File

@@ -242,10 +242,10 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|------------|-------------|------|-------|
| FP32 | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ |
| BF16 | | ✅ | ❓ |
| BF16 | 🚫 | ✅ | ❓ |
| Q4_0 | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ❓ | ❓ |
| MXFP4 | | ❓ | ❓ |
| MXFP4 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ❓ | ❓ |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Feb 15, 2026.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.

View File

@@ -108,7 +108,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
- Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://builders.intel.com/solutionslibrary/optimizing-and-running-llama2-on-intel-cpu) for more information.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
### Other BLAS libraries

View File

@@ -22,7 +22,7 @@ Legend:
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
@@ -31,7 +31,7 @@ Legend:
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -96,13 +96,13 @@ Legend:
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |

View File

@@ -77,8 +77,8 @@
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@@ -161,8 +161,8 @@
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"

View File

@@ -8760,14 +8760,22 @@
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=1","support","0","no","WebGPU"
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=32","support","0","no","WebGPU"
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=129","support","0","no","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[10,3,3,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f16,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
@@ -8778,14 +8786,22 @@
"WebGPU: WebGPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[10,3,3,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f32,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","CEIL","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","ROUND","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
@@ -18885,27 +18901,3 @@
"WebGPU: WebGPU","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[10,3,3,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"

View File

@@ -119,6 +119,8 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
of lookup n-gram (default: 12)
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
of draft m-gram (default: 48)
--spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding
(default: 1)
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
```
@@ -151,6 +153,10 @@ Sets the size M of the draft m-gram for n-gram map based speculative decoding.
The m-gram size determines how many tokens to draft when a match is found.
Larger values can provide more speedup but may reduce acceptance rate.
### `--spec-ngram-check-rate R`
This option targets performance when the n-gram lookup in the token history is too costly: a lookup is then executed only every R tokens (default: 1, i.e. every token).
### `--spec-ngram-min-hits H`
This option defines how many times a key has to appear in the token history before it is used as a draft (default: 1).
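As an illustration, the two options combine like this (the model path is a placeholder, and `llama-cli` stands in for any tool that accepts these common options):
```
# look up n-grams only every 4th token, and require a key to appear twice before drafting
./build/bin/llama-cli -m model.gguf --spec-ngram-check-rate 4 --spec-ngram-min-hits 2
```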
@@ -169,12 +175,7 @@ draft acceptance rate = 0.70312 ( 90 accepted / 128 generated)
statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
```
```
statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts = 26, #gen tokens = 1248, #acc tokens = 968, dur(b,g,a) = 2.234, 1.427, 0.016 ms
```
- `#calls(b,g,a)`: number of calls to begin (new prompt), generation, and accumulation for this implementation
- `#calls`: number of calls to this implementation
- `#gen drafts`: number of drafts generated by this implementation
- `#acc drafts`: number of drafts accepted (partially) by the main model
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)

View File

@@ -5,7 +5,6 @@
#include "sampling.h"
#include <algorithm>
#include <clocale>
#include <cstdio>
#include <string>
#include <vector>
@@ -17,8 +16,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.prompt = "Hello my name is";

View File

@@ -5,16 +5,14 @@
#include "common.h"
#include "log.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <climits>
#include <clocale>
#include <cstdarg>
#include <cstring>
#include <ctime>
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
@@ -876,8 +874,6 @@ static std::string basename(const std::string &path) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_init();
struct train_params params = get_default_train_params();

View File

@@ -1,14 +1,11 @@
// Warns users that this filename was deprecated, and provides a link for more information.
#include <clocale>
#include <cstdio>
#include <string>
#include <unordered_map>
// Main
int main(int argc, char** argv) {
std::setlocale(LC_NUMERIC, "C");
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];

View File

@@ -7,7 +7,6 @@
#include <limits.h>
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstring>
#include <limits>
@@ -539,8 +538,6 @@ static std::string format_input_text(const std::string & prompt, const std::stri
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
ggml_time_init();
common_params params;

View File

@@ -3,7 +3,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <ctime>
#include <algorithm>
@@ -95,8 +94,6 @@ static void print_raw_embeddings(const float * emb,
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {

View File

@@ -4,8 +4,6 @@
#include "log.h"
#include "llama.h"
#include "llama-cpp.h"
#include <clocale>
#include <string>
#include <vector>
@@ -31,8 +29,6 @@ static bool run(llama_context * ctx, const common_params & params) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
base_callback_data cb_data;
common_params params;

View File

@@ -1,7 +1,6 @@
#include "arg.h"
#include "common.h"
#include <clocale>
#include <fstream>
#include <sstream>
#include <string>
@@ -101,8 +100,6 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
}
int main(int, char **) {
std::setlocale(LC_NUMERIC, "C");
for (const auto & md : md_files) {
std::ifstream infile(md.fname);
if (!infile.is_open()) {

View File

@@ -1,14 +1,13 @@
#include "ggml.h"
#include "gguf.h"
#include <algorithm>
#include <clocale>
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <cstdlib> /* abort() */
#include <cstring>
#include <stdexcept>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
@@ -627,8 +626,6 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
}
int main(int argc, const char ** argv) {
std::setlocale(LC_NUMERIC, "C");
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);

View File

@@ -1,7 +1,6 @@
#include "ggml.h"
#include "gguf.h"
#include <clocale>
#include <cstdio>
#include <string>
#include <sstream>
@@ -241,8 +240,6 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
if (argc < 3) {
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
printf("r: read data.gguf file\n");

View File

@@ -4,11 +4,10 @@
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <cstdio>
#include <string>
#include <vector>
#include <algorithm>
struct ngram_data {
bool active = false;
@@ -39,8 +38,6 @@ struct ngram_container {
};
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {

View File

@@ -3,13 +3,10 @@
#include "ngram-cache.h"
#include "llama.h"
#include <clocale>
#include <string>
#include <vector>
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {

View File

@@ -3,7 +3,6 @@
#include "common.h"
#include "ngram-cache.h"
#include <clocale>
#include <cstdint>
#include <cstdio>
#include <fstream>
@@ -18,8 +17,6 @@ static void print_usage(char* argv0) {
}
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
if (argc < 3) {
print_usage(argv[0]);
exit(1);

View File

@@ -5,17 +5,14 @@
#include "llama.h"
#include "ggml.h"
#include <cinttypes>
#include <clocale>
#include <cstdint>
#include <cstdio>
#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {

View File

@@ -6,7 +6,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cstdint>
#include <cstdio>
#include <fstream>
@@ -14,8 +13,6 @@
#include <vector>
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {

View File

@@ -77,10 +77,7 @@ causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-em
@./scripts/causal/compare-embeddings-logits.sh
causal-inspect-original-model:
@./scripts/utils/inspect-org-model.py --list-all -s
causal-list-original-model-tensors:
@./scripts/utils/inspect-org-model.py --list-all-short -s
@./scripts/utils/inspect-org-model.py
causal-inspect-converted-model:
@./scripts/utils/inspect-converted-model.sh
@@ -156,7 +153,7 @@ embedding-verify-logits-st: embedding-run-original-model-st embedding-run-conver
embedding-inspect-original-model:
$(call validate_embedding_model_path,embedding-inspect-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH} --list-all -s
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
embedding-inspect-converted-model:
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}

View File

@@ -42,15 +42,11 @@ def load_model_and_tokenizer(model_path, device="auto"):
config = config.text_config
multimodal = True
def print_if_exists(label, obj, attr, default="N/A"):
val = getattr(obj, attr) if hasattr(obj, attr) else default
print(f"{label}", val)
print_if_exists("Vocab size: ", config, "vocab_size")
print_if_exists("Hidden size: ", config, "hidden_size")
print_if_exists("Number of layers: ", config, "num_hidden_layers")
print_if_exists("BOS token id: ", config, "bos_token_id")
print_if_exists("EOS token id: ", config, "eos_token_id")
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
if unreleased_model_name:

View File

@@ -1,290 +1,67 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Optional
import json
from safetensors import safe_open
from collections import defaultdict
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
DTYPE_SIZES = {
"F64": 8, "I64": 8, "U64": 8,
"F32": 4, "I32": 4, "U32": 4,
"F16": 2, "BF16": 2, "I16": 2, "U16": 2,
"I8": 1, "U8": 1, "BOOL": 1,
"F8_E4M3": 1, "F8_E5M2": 1,
}
# Check if there's an index file (multi-file model)
index_path = os.path.join(model_path, "model.safetensors.index.json")
single_file_path = os.path.join(model_path, "model.safetensors")
SIZE_UNITS = ['B', 'KB', 'MB', 'GB', 'TB']
if os.path.exists(index_path):
# Multi-file model
print("Multi-file model detected")
with open(index_path, 'r') as f:
index_data = json.load(f)
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
index_file = model_path / MODEL_SAFETENSORS_INDEX
# Get the weight map (tensor_name -> file_name)
weight_map = index_data.get("weight_map", {})
if index_file.exists():
with open(index_file, 'r') as f:
index = json.load(f)
return index.get("weight_map", {})
# Group tensors by file for efficient processing
file_tensors = defaultdict(list)
for tensor_name, file_name in weight_map.items():
file_tensors[file_name].append(tensor_name)
return None
print("Tensors in model:")
# Process each shard file
for file_name, tensor_names in file_tensors.items():
file_path = os.path.join(model_path, file_name)
print(f"\n--- From {file_name} ---")
def get_all_tensor_names(model_path: Path) -> list[str]:
weight_map = get_weight_map(model_path)
with safe_open(file_path, framework="pt") as f:
for tensor_name in sorted(tensor_names):
tensor = f.get_tensor(tensor_name)
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
if weight_map is not None:
return list(weight_map.keys())
elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
with safe_open(single_file, framework="pt", device="cpu") as f:
return list(f.keys())
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
with safe_open(single_file_path, framework="pt") as f:
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):
tensor = f.get_tensor(key)
print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return weight_map.get(tensor_name)
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
return single_file.name
return None
def read_safetensors_header(file_path: Path) -> dict:
with open(file_path, 'rb') as f:
header_size = struct.unpack('<Q', f.read(8))[0]
return json.loads(f.read(header_size))
def get_tensor_size_bytes(tensor_meta: dict) -> int:
offsets = tensor_meta.get("data_offsets")
if offsets and len(offsets) == 2:
return offsets[1] - offsets[0]
n_elements = 1
for d in tensor_meta.get("shape", []):
n_elements *= d
return n_elements * DTYPE_SIZES.get(tensor_meta.get("dtype", "F32"), 4)
def format_size(size_bytes: int) -> str:
val = float(size_bytes)
for unit in SIZE_UNITS[:-1]:
if val < 1024.0:
return f"{val:.2f} {unit}"
val /= 1024.0
return f"{val:.2f} {SIZE_UNITS[-1]}"
def get_all_tensor_metadata(model_path: Path) -> dict[str, dict]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
file_to_tensors: dict[str, list[str]] = {}
for tensor_name, file_name in weight_map.items():
file_to_tensors.setdefault(file_name, []).append(tensor_name)
all_metadata: dict[str, dict] = {}
for file_name, tensor_names in file_to_tensors.items():
try:
header = read_safetensors_header(model_path / file_name)
for tensor_name in tensor_names:
if tensor_name in header:
all_metadata[tensor_name] = header[tensor_name]
except Exception as e:
print(f"Warning: Could not read header from {file_name}: {e}", file=sys.stderr)
return all_metadata
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
header = read_safetensors_header(single_file)
return {k: v for k, v in header.items() if k != "__metadata__"}
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def normalize_tensor_name(tensor_name: str) -> str:
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
normalized = re.sub(r'\.\d+$', '.#', normalized)
return normalized
def list_all_tensors(
model_path: Path,
short: bool = False,
show_sizes: bool = False,
):
tensor_names = get_all_tensor_names(model_path)
metadata: Optional[dict[str, dict]] = None
if show_sizes:
metadata = get_all_tensor_metadata(model_path)
total_bytes = 0
if short:
seen: dict[str, str] = {}
for tensor_name in sorted(tensor_names):
normalized = normalize_tensor_name(tensor_name)
if normalized not in seen:
seen[normalized] = tensor_name
display_pairs = list(sorted(seen.items()))
name_width = max((len(n) for n, _ in display_pairs), default=0)
for normalized, first_name in display_pairs:
if metadata and first_name in metadata:
m = metadata[first_name]
size = get_tensor_size_bytes(m)
total_bytes += size
print(f"{normalized:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}")
else:
print(normalized)
else:
print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
print("Available files:")
if os.path.exists(model_path):
for item in sorted(os.listdir(model_path)):
print(f" {item}")
else:
name_width = max((len(n) for n in tensor_names), default=0)
for tensor_name in sorted(tensor_names):
if metadata and tensor_name in metadata:
m = metadata[tensor_name]
size = get_tensor_size_bytes(m)
total_bytes += size
print(f"{tensor_name:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}")
else:
print(tensor_name)
if show_sizes:
print(f"\nTotal: {format_size(total_bytes)}")
def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
print(f"Error: Could not find tensor '{tensor_name}' in model index")
print(f"Model path: {model_path}")
sys.exit(1)
file_path = model_path / tensor_file
try:
header = read_safetensors_header(file_path)
tensor_meta = header.get(tensor_name, {})
dtype_str = tensor_meta.get("dtype")
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name in f.keys():
tensor_slice = f.get_slice(tensor_name)
shape = tensor_slice.get_shape()
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
if dtype_str:
print(f"Dtype: {dtype_str}")
if tensor_meta:
print(f"Size: {format_size(get_tensor_size_bytes(tensor_meta))}")
if num_values is not None:
tensor = f.get_tensor(tensor_name)
if not dtype_str:
print(f"Dtype: {tensor.dtype}")
flat = tensor.flatten()
n = min(num_values, flat.numel())
print(f"Values: {flat[:n].tolist()}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
except FileNotFoundError:
print(f"Error: The file '{file_path}' was not found.")
sys.exit(1)
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Print tensor information from a safetensors model"
)
parser.add_argument(
"tensor_name",
nargs="?",
help="Name of the tensor to inspect"
)
parser.add_argument(
"-m", "--model-path",
type=Path,
help="Path to the model directory (default: MODEL_PATH environment variable)"
)
parser.add_argument(
"-l", "--list-all-short",
action="store_true",
help="List unique tensor patterns (layer numbers replaced with #)"
)
parser.add_argument(
"-la", "--list-all",
action="store_true",
help="List all tensor names with actual layer numbers"
)
parser.add_argument(
"-n", "--num-values",
nargs="?",
const=10,
default=None,
type=int,
metavar="N",
help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)"
)
parser.add_argument(
"-s", "--sizes",
action="store_true",
help="Show dtype, shape, and size for each tensor when listing"
)
args = parser.parse_args()
model_path = args.model_path
if model_path is None:
model_path_str = os.environ.get("MODEL_PATH")
if model_path_str is None:
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
sys.exit(1)
model_path = Path(model_path_str)
if not model_path.exists():
print(f"Error: Model path does not exist: {model_path}")
sys.exit(1)
if not model_path.is_dir():
print(f"Error: Model path is not a directory: {model_path}")
sys.exit(1)
if args.list_all_short or args.list_all:
list_all_tensors(model_path, short=args.list_all_short, show_sizes=args.sizes)
else:
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list-all-short or --list-all")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name, args.num_values)
if __name__ == "__main__":
main()
print(f" Directory {model_path} does not exist")
exit(1)

View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Optional
from safetensors import safe_open
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
index_file = model_path / MODEL_SAFETENSORS_INDEX
if index_file.exists():
with open(index_file, 'r') as f:
index = json.load(f)
return index.get("weight_map", {})
return None
def get_all_tensor_names(model_path: Path) -> list[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return list(weight_map.keys())
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
with safe_open(single_file, framework="pt", device="cpu") as f:
return list(f.keys())
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return weight_map.get(tensor_name)
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
return single_file.name
return None
def normalize_tensor_name(tensor_name: str) -> str:
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
normalized = re.sub(r'\.\d+$', '.#', normalized)
return normalized
def list_all_tensors(model_path: Path, unique: bool = False):
tensor_names = get_all_tensor_names(model_path)
if unique:
seen = set()
for tensor_name in sorted(tensor_names):
normalized = normalize_tensor_name(tensor_name)
if normalized not in seen:
seen.add(normalized)
print(normalized)
else:
for tensor_name in sorted(tensor_names):
print(tensor_name)
def print_tensor_info(model_path: Path, tensor_name: str):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
print(f"Error: Could not find tensor '{tensor_name}' in model index")
print(f"Model path: {model_path}")
sys.exit(1)
file_path = model_path / tensor_file
try:
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name in f.keys():
tensor_slice = f.get_slice(tensor_name)
shape = tensor_slice.get_shape()
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
except FileNotFoundError:
print(f"Error: The file '{file_path}' was not found.")
sys.exit(1)
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Print tensor information from a safetensors model"
)
parser.add_argument(
"tensor_name",
nargs="?", # optional (if --list is used for example)
help="Name of the tensor to inspect"
)
parser.add_argument(
"-m", "--model-path",
type=Path,
help="Path to the model directory (default: MODEL_PATH environment variable)"
)
parser.add_argument(
"-l", "--list",
action="store_true",
help="List unique tensor patterns in the model (layer numbers replaced with #)"
)
args = parser.parse_args()
model_path = args.model_path
if model_path is None:
model_path_str = os.environ.get("MODEL_PATH")
if model_path_str is None:
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
sys.exit(1)
model_path = Path(model_path_str)
if not model_path.exists():
print(f"Error: Model path does not exist: {model_path}")
sys.exit(1)
if not model_path.is_dir():
print(f"Error: Model path is not a directory: {model_path}")
sys.exit(1)
if args.list:
list_all_tensors(model_path, unique=True)
else:
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name)
if __name__ == "__main__":
main()
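A usage sketch for this script (the script location and model/tensor names are illustrative):
```bash
# list unique tensor name patterns (layer numbers collapsed to '#')
./scripts/utils/inspect-org-model.py -m ~/models/my-model -l
# inspect a single tensor, taking the model path from the environment
MODEL_PATH=~/models/my-model ./scripts/utils/inspect-org-model.py model.embed_tokens.weight
```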

View File

@@ -7,13 +7,12 @@
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include <ctime>
#include <algorithm>
// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
@@ -154,8 +153,6 @@ static std::vector<std::string> split_string(const std::string& input, char deli
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
srand(1234);
common_params params;

View File

@@ -3,7 +3,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cmath>
#include <cstdio>
#include <string>
@@ -17,8 +16,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.n_junk = 250;

View File

@@ -4,7 +4,6 @@
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <fstream>
#include <iostream> // TODO: remove me
@@ -113,8 +112,6 @@ static void batch_process(llama_context * ctx, llama_batch & batch, float * outp
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {

View File

@@ -2,21 +2,15 @@
#include "common.h"
#include "llama.h"
#include <clocale>
#include <vector>
#include <cstdio>
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.prompt = "The quick brown fox";
params.sampling.seed = 1234;
const std::string_view state_file = "dump_state.bin";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
@@ -59,16 +53,35 @@ int main(int argc, char ** argv) {
// tokenize prompt
auto tokens = common_tokenize(ctx, params.prompt, true);
const bool save_state = true;
if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) {
return 1;
// prepare the batch
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
for (size_t i = 0; i < tokens.size(); i++) {
common_batch_add(batch, tokens[i], i, {0}, false);
}
batch.logits[batch.n_tokens - 1] = true; // generate next token
// evaluate prompt
llama_decode(ctx, batch);
n_past += batch.n_tokens;
// save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
FILE *fp_write = fopen("dump_state.bin", "wb");
fwrite(state_mem.data(), 1, written, fp_write);
fclose(fp_write);
fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
}
// save state (last tokens)
const auto n_past_saved = n_past;
// first run
printf("\nfirst run: %s", params.prompt.c_str());
llama_batch batch = llama_batch_init(1, 0, 1);
for (auto i = 0; i < params.n_predict; i++) {
auto next_token = llama_sampler_sample(smpl, ctx, -1);
auto next_token_str = common_token_to_piece(ctx, next_token);
@@ -98,23 +111,27 @@ int main(int argc, char ** argv) {
printf("\nsecond run: %s", params.prompt.c_str());
// load state from file
std::vector<llama_token> unused_sts(tokens.size()); // unused session tokens.
size_t n_token_count_out = 0;
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem;
if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
fprintf(stderr, "\n%s : failed to load state\n", __func__);
return 1;
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
return 1;
}
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
}
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
// restore state (last tokens)
n_past = n_token_count_out;
if (!common_replay_last_token(ctx2, tokens.back(), n_past)) {
return 1;
}
++n_past;
n_past = n_past_saved;
// second run
for (auto i = 0; i < params.n_predict; i++) {
@@ -143,9 +160,7 @@ int main(int argc, char ** argv) {
}
// make new context
auto params_ctx3 = common_context_params_to_llama(params);
params_ctx3.n_seq_max = 2;
llama_context * ctx3 = llama_init_from_model(model, params_ctx3);
llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params));
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
@@ -154,21 +169,26 @@ int main(int argc, char ** argv) {
printf("\nsingle seq run: %s", params.prompt.c_str());
// load state (rng, logits, embedding and kv_cache) from file
n_token_count_out = 0;
{
std::vector<uint8_t> state_mem;
if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
fprintf(stderr, "\n%s : failed to load state\n", __func__);
return 1;
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
return 1;
}
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
}
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
// restore state (last tokens)
n_past = n_token_count_out;
if (!common_replay_last_token(ctx3, tokens.back(), n_past)) {
return 1;
}
++n_past;
n_past = n_past_saved;
// save seq 0 and load into seq 1
{

View File

@@ -1,5 +1,4 @@
#include "llama.h"
#include <clocale>
#include <cstdio>
#include <cstring>
#include <iostream>
@@ -13,8 +12,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
std::string model_path;
int ngl = 99;
int n_ctx = 2048;

View File

@@ -1,5 +1,4 @@
#include "llama.h"
#include <clocale>
#include <cstdio>
#include <cstring>
#include <string>
@@ -12,8 +11,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
// path to the model gguf file
std::string model_path;
// prompt to generate text from

View File

@@ -5,15 +5,12 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {

View File

@@ -5,7 +5,6 @@
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <cstdio>
#include <cstring>
#include <random>
@@ -31,8 +30,6 @@ struct seq_draft {
};
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
// needed to get candidate probs even for temp <= 0.0

View File

@@ -6,10 +6,8 @@
#include "ggml-sycl.h"
#include <clocale>
int main() {
std::setlocale(LC_NUMERIC, "C");
ggml_backend_sycl_print_sycl_devices();
return 0;
}

View File

@@ -3,7 +3,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cmath>
#include <cstdio>
#include <cstring>
@@ -15,8 +14,6 @@
#endif
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.escape = false;

View File

@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 7)
set(GGML_VERSION_PATCH 5)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

View File

@@ -68,7 +68,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend (stream)
@@ -109,7 +109,18 @@ extern "C" {
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous tensor shuffle
// - src1, dst1 belong to backend_1
// - src2, dst2 belong to backend_2
// - src1 is copied to dst2
// - src2 is copied to dst1
// - both backends wait until both copies have completed
GGML_API void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
@@ -135,7 +146,9 @@ extern "C" {
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
GGML_BACKEND_DEVICE_TYPE_ACCEL,
// "meta" device wrapping multiple other devices for tensor parallelism
GGML_BACKEND_DEVICE_TYPE_META,
};
// functionality supported by the device
@@ -211,6 +224,52 @@ extern "C" {
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
//
// Meta backend
//
enum ggml_backend_meta_split_state {
// tensor split by tensor dimensions:
GGML_BACKEND_SPLIT_STATE_BY_NE0 = 0,
GGML_BACKEND_SPLIT_STATE_BY_NE1 = 1,
GGML_BACKEND_SPLIT_STATE_BY_NE2 = 2,
GGML_BACKEND_SPLIT_STATE_BY_NE3 = 3,
GGML_BACKEND_SPLIT_STATE_MIRRORED = 10, // all values on all backends
GGML_BACKEND_SPLIT_STATE_PARTIAL = 11, // each backend has a partial sum
// for internal bookkeeping only:
GGML_BACKEND_SPLIT_STATE_NONE = 98,
GGML_BACKEND_SPLIT_STATE_UNKNOWN = 99,
};
// function to assign split states to statically allocated tensors; split states of computed tensors will be assigned to be compatible:
typedef enum ggml_backend_meta_split_state (*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
GGML_API bool ggml_backend_dev_is_meta(ggml_backend_dev_t dev);
GGML_API size_t ggml_backend_meta_dev_n_devs(ggml_backend_dev_t meta_dev);
GGML_API ggml_backend_dev_t ggml_backend_meta_dev_simple_dev(ggml_backend_dev_t meta_dev, size_t index);
// create a new meta device from "simple" devices; the meta buffer type/buffer/backend is then derived from it:
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
GGML_API bool ggml_backend_buft_is_meta(ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_meta_buft_n_bufts(ggml_backend_buffer_type_t meta_buft);
GGML_API ggml_backend_buffer_type_t ggml_backend_meta_buft_simple_buft(ggml_backend_buffer_type_t meta_buft, size_t index);
GGML_API bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf);
GGML_API size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf);
GGML_API ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index);
GGML_API struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index);
GGML_API bool ggml_backend_is_meta(ggml_backend_t backend);
GGML_API size_t ggml_backend_meta_n_backends(ggml_backend_t meta_backend);
GGML_API ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, size_t index);
GGML_API enum ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
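For example, enumerating the simple backends wrapped by a meta backend (a sketch; assumes <stdio.h>):

    if (ggml_backend_is_meta(backend)) {
        for (size_t i = 0; i < ggml_backend_meta_n_backends(backend); i++) {
            ggml_backend_t simple = ggml_backend_meta_simple_backend(backend, i);
            printf("simple backend %zu: %s\n", i, ggml_backend_name(simple));
        }
    }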
//
// Backend registry
//

View File

@@ -730,6 +730,10 @@ extern "C" {
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
"use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
@@ -748,7 +752,6 @@ extern "C" {
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_view (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);

View File

@@ -200,6 +200,7 @@ add_library(ggml-base
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-backend-meta.cpp
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h

View File

@@ -1,5 +1,6 @@
#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
@@ -17,6 +18,11 @@
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)
static bool ggml_is_view(const struct ggml_tensor * t) {
return t->view_src != NULL;
}
// ops that return true for this function must not use restrict pointers for their backend implementations
bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
@@ -622,7 +628,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_impl_is_view(node)) {
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
hn->allocated = true;
assert(hn->addr.offset == 0);
@@ -653,7 +659,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
if (p_hn->n_children == 1 && p_hn->n_views == 0) {
if (ggml_impl_is_view(parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
@@ -734,7 +740,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
// GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
// control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
// itself is never used and should not be considered a dependency
if (ggml_impl_is_view(node) && node->op != GGML_OP_NONE) {
if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
struct ggml_tensor * view_src = node->view_src;
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
@@ -801,7 +807,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_impl_is_view(parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
view_src_hn->n_views -= 1;
@@ -1235,6 +1241,9 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_meta(buft)) {
return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
}
size_t nbytes_total = 0;
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
}
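The meta case is routed to ggml_backend_meta_alloc_ctx_tensors_from_buft, the "temporary workaround" declared in ggml-backend-impl.h below, since the generic path cannot allocate a tensor that has to be materialized once per underlying simple buffer.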

View File

@@ -2,7 +2,9 @@
// ggml-backend internal header
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
@@ -90,9 +92,16 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
bool (*shfl_tensor_async)(ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2, struct ggml_tensor * dst1, struct ggml_tensor * dst2);
// (optional) backend-specific AllReduce operation for meta backend
bool (*allreduce_tensor_async)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);
@@ -250,6 +259,9 @@ extern "C" {
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
#endif
// temporary workaround to statically allocate tensors from a context in a deduplicated way:
GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

View File

@@ -471,10 +471,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
int best_score = 0;
fs::path best_path;
std::error_code ec;
for (const auto & search_path : search_paths) {
if (!fs::exists(search_path, ec)) {
if (std::error_code ec; !fs::exists(search_path, ec)) {
if (ec) {
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
} else {
@@ -484,7 +483,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
}
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file(ec)) {
if (entry.is_regular_file()) {
auto filename = entry.path().filename();
auto ext = entry.path().extension();
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {

View File

@@ -123,7 +123,7 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized
if (buffer->size == 0) {
if (!ggml_backend_buffer_is_meta(buffer) && buffer->size == 0) {
return NULL;
}
@@ -388,7 +388,7 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
// backend copy
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -402,7 +402,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
#endif // NDEBUG
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -411,7 +411,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
}
}
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -432,6 +432,20 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
ggml_backend_tensor_copy(src, dst);
}
void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2) {
GGML_ASSERT(ggml_are_same_layout(src1, dst1) && "cannot shuffle tensors with different layouts");
GGML_ASSERT(ggml_are_same_layout(src2, dst2) && "cannot shuffle tensors with different layouts");
if (backend_1->iface.shfl_tensor_async != NULL) {
if (backend_1->iface.shfl_tensor_async(backend_1, backend_2, src1, src2, dst1, dst2)) {
return;
}
}
ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2);
ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1);
}
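If backend_1 does not provide shfl_tensor_async (or it declines the pair), the shuffle degrades gracefully into two plain async copies, and each of those in turn falls back to a synchronized ggml_backend_tensor_copy when cpy_tensor_async is unavailable, so the call stays correct and only loses the combined-synchronization optimization.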
// events
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
@@ -500,6 +514,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
}
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
GGML_ASSERT(device);
memset(props, 0, sizeof(*props));
device->iface.get_props(device, props);
}
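Zero-initializing props before dispatch means any fields a backend's get_props does not fill in (for example, fields added to the struct after the backend was written) read as zero instead of uninitialized stack memory.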
@@ -1455,6 +1470,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
// copy the input tensors to the split backend
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1465,16 +1484,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
@@ -1578,6 +1593,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
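Taken together with the hunk above, the synchronization policy becomes: when no per-copy event exists for the split backend, synchronize it once before copying the inputs and once before launching the graph, rather than re-synchronizing inside the per-input loop; the event-based path is unchanged.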
@@ -1899,8 +1918,9 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer) ||
(char *) addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *) ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
tensor->buffer = buffer;
tensor->data = addr;

View File

@@ -260,8 +260,12 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -3286,223 +3286,130 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor
}
/**
* @brief Performs quantized matrix multiplication for Mixture of Experts (MoE)
* models using the CANN backend.
* @brief Performs expert-specific matrix multiplication (MoE) with
* quantized precision using the CANN backend.
*
* This function implements MUL_MAT_ID operation for quantized weight matrices
* (Q4_0 and Q8_0 formats). It selects expert-specific weight matrices based on
* the provided expert indices, and computes matrix multiplication using CANN's
* WeightQuantBatchMatmulV2 operator.
* This function executes a matrix multiplication operation tailored for
* Mixture of Experts (MoE) models, where the input tensor is multiplied
* with expert-specific quantized weight matrices. It leverages the CANN
* backend to perform efficient low-precision computations and stores the
* quantized result in the destination tensor `dst`.
*
* The function performs the following steps:
* 1. Converts input/output tensors to F16 format if necessary
* 2. Uses IndexSelect to extract expert-specific weights and scales based on indices
* 3. Performs quantized matrix multiplication for each expert using WeightQuantBatchMatmulV2
* 4. Converts output back to the target type if needed
* Quantization techniques reduce memory footprint and improve performance
* by using lower-bit representations (e.g., int8) instead of floating-point.
* This function is designed to work with such formats and may incorporate
* optimizations like identity-based fast paths or routing masks for sparse
* expert selection.
*
* Tensor shapes:
* - dst: [M, K, N, 1] - output tensor
* - src0: [D, M, A, 1] - quantized weight matrices (Q4_0 or Q8_0)
* - src1: [D, B, N, 1] - input activations (B = K for per-expert input, or B = 1 for broadcast)
* - ids: [K, N] - expert indices for routing
* @param ctx The context for executing CANN backend operations.
* @param dst The destination tensor where the quantized MoE multiplication result
* will be stored.
*
* @param ctx The CANN backend context for operation execution.
* @param dst The destination tensor where the multiplication result will be stored.
*
* @note Only Q4_0 and Q8_0 quantization formats are supported.
* @note The function handles automatic type conversion to/from F16 as needed by the hardware.
* @note This function assumes quantized data types and is designed for
* MoE architectures with potential sparse expert routing.
*/
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
// dst: [M, K, N, 1]
// src0: [D, M, A, 1] - quantized weights
// src1: [D, B, N, 1] - input activations, B = K or B = 1
// ids: [K, N] - expert indices
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
ggml_tensor * ids = dst->src[2];
// TODO: Use aclnnGroupedMatMul
// dst  [M, K, N, 1]
ggml_tensor * src0 = dst->src[0]; // src0 [D, M, A, 1]
ggml_tensor * src1 = dst->src[1]; // src1 [D, B, N, 1], B = K or B = 1
ggml_tensor * ids = dst->src[2]; // ids  [K, N]
GGML_ASSERT(src0->ne[3] == 1);
GGML_ASSERT(src1->ne[3] == 1);
GGML_ASSERT(dst->ne[3] == 1);
GGML_ASSERT(src1->ne[2] == ids->ne[1]);
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t n_batches = ids->ne[1];
const int64_t n_select_experts = ids->ne[0];
const enum ggml_type type = src0->type;
// copy the expert indices from NPU to CPU
int64_t n_as = ne02; // A
int64_t n_ids = ids->ne[0]; // K
const int32_t group_size = QK8_0; // Both Q4_0 and Q8_0 use group size of 32
GGML_ASSERT(group_size == QK4_0);
std::vector<char> ids_host(ggml_nbytes(ids));
ACL_CHECK(aclrtMemcpyAsync(ids_host.data(), ggml_nbytes(ids), ids->data, ggml_nbytes(ids),
ACL_MEMCPY_DEVICE_TO_HOST, ctx.stream()));
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
// Calculate element size for quantized weights
const float weight_elem_size =
(type == GGML_TYPE_Q4_0) ? 0.5f :
(type == GGML_TYPE_Q8_0) ? 1.0f :
(GGML_ABORT("MUL_MAT_ID only supports Q4_0 and Q8_0"), 0.0f);
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
// Calculate scale offset in memory
const size_t weight_size = src0->ne[0] * src0->ne[1] * src0->ne[2] * weight_elem_size;
const size_t scale_elem_size = sizeof(uint16_t);
char * scale_data = (char *) src0->data + weight_size;
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
// Allocate buffers for selected expert weights and scales
const size_t selected_weight_size = src0->ne[0] * src0->ne[1] * n_select_experts * weight_elem_size;
ggml_cann_pool_alloc selected_weight_alloc(ctx.pool(), selected_weight_size);
void * selected_weight_buffer = selected_weight_alloc.get();
const size_t selected_scale_size = (src0->ne[0] / group_size) * src0->ne[1] * n_select_experts * scale_elem_size;
ggml_cann_pool_alloc selected_scale_alloc(ctx.pool(), selected_scale_size);
void * selected_scale_buffer = selected_scale_alloc.get();
// Helper lambda to allocate and cast tensor to F16 if needed
constexpr size_t f16_elem_size = sizeof(uint16_t);
auto prepare_f16_buffer = [&](ggml_tensor * tensor, ggml_cann_pool_alloc & allocator,
bool need_cast = false) -> void * {
if (tensor->type == GGML_TYPE_F16) {
return tensor->data;
}
size_t total_size = f16_elem_size;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
total_size *= tensor->ne[i];
}
void * buffer = allocator.alloc(total_size);
if (need_cast == false) {
return buffer;
}
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
for (int i = 0; i < GGML_MAX_DIMS; i++) {
ne[i] = tensor->ne[i];
if (i > 0) {
nb[i] = nb[i - 1] * ne[i - 1];
}
}
acl_tensor_ptr src_tensor = ggml_cann_create_tensor(tensor);
acl_tensor_ptr f16_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
aclnn_cast(ctx, src_tensor.get(), f16_tensor.get(), ACL_FLOAT16);
return buffer;
};
// Prepare input and output buffers
ggml_cann_pool_alloc input_alloc(ctx.pool());
void * input_buffer = prepare_f16_buffer(src1, input_alloc, true);
ggml_cann_pool_alloc output_alloc(ctx.pool());
void * output_buffer = prepare_f16_buffer(dst, output_alloc, false);
// Process each batch
for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) {
// Create index tensor for current batch
const size_t index_offset = batch_idx * ids->nb[1];
acl_tensor_ptr batch_indices = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, index_offset);
// Select quantized weights using expert indices
// Q4_0 stores 2 values per byte, Q8_0 stores 1 value per byte
const int64_t weight_d = (type == GGML_TYPE_Q4_0) ? src0->ne[0] / 2 : src0->ne[0];
const int64_t weight_m = src0->ne[1];
const int64_t weight_n_experts = src0->ne[2];
int64_t weight_ne[3] = { weight_d, weight_m, weight_n_experts };
size_t weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t), weight_d * weight_m * sizeof(int8_t) };
acl_tensor_ptr all_weights =
ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, 3);
int64_t selected_weight_ne[3] = { weight_d, weight_m, n_select_experts };
size_t selected_weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t),
weight_d * weight_m * sizeof(int8_t) };
acl_tensor_ptr selected_weights = ggml_cann_create_tensor(selected_weight_buffer, ACL_INT8, sizeof(int8_t),
selected_weight_ne, selected_weight_nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_weights.get(), 0, batch_indices.get(), selected_weights.get());
// Select scales using the same expert indices
const int64_t scale_d = src0->ne[0] / group_size;
int64_t scale_ne[3] = { scale_d, weight_m, weight_n_experts };
size_t scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size, scale_d * weight_m * scale_elem_size };
acl_tensor_ptr all_scales =
ggml_cann_create_tensor(scale_data, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 3);
int64_t selected_scale_ne[3] = { scale_d, weight_m, n_select_experts };
size_t selected_scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size,
scale_d * weight_m * scale_elem_size };
acl_tensor_ptr selected_scales = ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size,
selected_scale_ne, selected_scale_nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_scales.get(), 0, batch_indices.get(), selected_scales.get());
// Process each expert for current batch
// IndexSelect output layout: [D, M, K] in contiguous format
// WeightQuantBatchMatmulV2 expects: [M, D] with row-major stride
for (int64_t expert_idx = 0; expert_idx < n_select_experts; expert_idx++) {
// Determine input offset: broadcast if src1->ne[1]==1, otherwise use per-expert input
const size_t input_offset =
(batch_idx * src1->ne[1] + (src1->ne[1] == 1 ? 0 : expert_idx)) * src1->ne[0] * f16_elem_size;
const size_t output_offset = (batch_idx * dst->ne[1] + expert_idx) * dst->ne[0] * f16_elem_size;
// Create weight view for current expert: [D, M, K] -> [M, D]
int64_t weight_view_ne[2] = { weight_m, src0->ne[0] };
float weight_view_nb[2] = { src0->ne[0] * weight_elem_size, weight_elem_size };
const size_t weight_view_offset = expert_idx * selected_weight_nb[2];
acl_tensor_ptr weight_view =
ggml_cann_create_tensor(selected_weight_buffer, ggml_cann_type_mapping(type), weight_elem_size,
weight_view_ne, weight_view_nb, 2, ACL_FORMAT_ND, weight_view_offset);
// Create scale view for current expert: [D, M, K] -> [M, D]
int64_t scale_view_ne[2] = { weight_m, scale_d };
size_t scale_view_nb[2] = { selected_scale_nb[1], selected_scale_nb[0] };
const size_t scale_view_offset = expert_idx * selected_scale_nb[2];
acl_tensor_ptr scale_view =
ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size, scale_view_ne,
scale_view_nb, 2, ACL_FORMAT_ND, scale_view_offset);
// Create input activation tensor [D, 1]
int64_t input_ne[2] = { src1->ne[0], 1 };
size_t input_nb[2] = { f16_elem_size, src1->ne[0] * f16_elem_size };
acl_tensor_ptr input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, f16_elem_size, input_ne,
input_nb, 2, ACL_FORMAT_ND, input_offset);
// Create output tensor [M, 1]
int64_t output_ne[2] = { dst->ne[0], 1 };
size_t output_nb[2] = { f16_elem_size, dst->ne[0] * f16_elem_size };
acl_tensor_ptr output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, output_ne,
output_nb, 2, ACL_FORMAT_ND, output_offset);
// Perform quantized matrix multiplication
GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, input_tensor.get(), weight_view.get(),
scale_view.get(), nullptr, nullptr, nullptr, nullptr, group_size,
output_tensor.get());
}
const enum ggml_type type = dst->src[0]->type;
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
} else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
} else {
GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
}
// Cast output back to original type if we used a temporary F16 buffer
if (dst->type != GGML_TYPE_F16) {
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
for (int i = 0; i < GGML_MAX_DIMS; i++) {
ne[i] = dst->ne[i];
if (i > 0) {
nb[i] = nb[i - 1] * ne[i - 1];
}
// src0_row [D, M, 1, 1] weight without permute
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[0] = weight_elem_size;
src0_row.nb[1] = weight_elem_size * ne00;
src0_row.nb[2] = weight_elem_size * ne00;
src0_row.nb[3] = weight_elem_size * ne00;
size_t weight_stride = ne00 * ne01 * weight_elem_size;
size_t weight_size = weight_stride * ne02 * ne03;
// scale [D, M, 1, 1] -> scale && permute
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
// src1_row [D, 1, 1, 1] -> input
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
// dst_row [M, 1, 1, 1] -> out
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
// create a staging buffer for one expert's weights and scales
ggml_cann_pool_alloc weight_allocator(ctx.pool());
void * weight_buffer = weight_allocator.alloc(nb02);
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
// expert index
int32_t i02 = *(int32_t *) (ids_host.data() + iid1 * ids->nb[1] + id * ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
// If B = 1 (broadcast), always use 0; otherwise, use id.
int64_t i11 = (ne11 == 1 ? 0 : id);
int64_t i12 = iid1;
int64_t i1 = id;
int64_t i2 = i12;
void * src0_tmp_ptr = src0_original + i02 * weight_stride;
void * scale_tmp_ptr = src0_original + weight_size + i02 * scale_stride;
void * src1_tmp_ptr = src1_original + i11 * nb11 + i12 * nb12;
void * dst_tmp_ptr = dst_original + i1 * nb1 + i2 * nb2;
// copy the selected expert's weights and scales into the staging buffer
ACL_CHECK(aclrtMemcpyAsync(weight_buffer, weight_stride, src0_tmp_ptr, weight_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
void * scale_buffer = (char *) weight_buffer + weight_stride;
ACL_CHECK(aclrtMemcpyAsync(scale_buffer, scale_stride, scale_tmp_ptr, scale_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
src0_row.data = weight_buffer;
src1_row.data = src1_tmp_ptr;
dst_row.data = dst_tmp_ptr;
dst_row.src[0] = &src0_row;
dst_row.src[1] = &src1_row;
ggml_cann_mul_mat(ctx, &dst_row);
}
acl_tensor_ptr f16_output =
ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
acl_tensor_ptr dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, f16_output.get(), dst_tensor.get(), ggml_cann_type_mapping(dst->type));
}
return;
}
void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

View File

@@ -794,44 +794,19 @@ struct ggml_backend_cann_buffer_context {
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
};
// cann buffer type
/**
* @brief Structure representing context information for a specific backend
* buffer type.
* @brief Check if a buffer is a CANN buffer.
*
* This function checks if a given buffer is a CANN buffer by comparing its
* `get_name` function pointer to `ggml_backend_cann_buffer_get_name`.
*
* @param buffer The buffer to check.
* @return true if the buffer is a CANN buffer, false otherwise.
*/
struct ggml_backend_cann_buffer_type_context {
int32_t device; /**< Device identifier associated with the buffer context. */
std::string name; /**< Name associated with the buffer context. */
};
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
/**
* @brief Retrieves the name associated with a CANN buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN buffer type context.
*
* @param buft Pointer to the buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->name.c_str();
}
/**
* @brief Checks if the backend buffer type is associated with the CANN backend.
*
* This function checks whether the provided backend buffer type is associated
* with the CANN backend based on the comparison of its name retrieval function
* pointer.
*
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the buffer type is associated with the CANN
* backend, otherwise false.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_cann(buffer->buft);
}
/**
@@ -1296,7 +1271,7 @@ static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor * src,
ggml_tensor * dst) {
if (ggml_backend_buft_is_cann(src->buffer->buft)) {
if (ggml_backend_buffer_is_cann(src->buffer)) {
ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
@@ -1360,6 +1335,31 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .reset = */ NULL,
};
// cann buffer type
/**
* @brief Structure representing context information for a specific backend
* buffer type.
*/
struct ggml_backend_cann_buffer_type_context {
int32_t device; /**< Device identifier associated with the buffer context. */
std::string name; /**< Name associated with the buffer context. */
};
/**
* @brief Retrieves the name associated with a CANN buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN buffer type context.
*
* @param buft Pointer to the buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->name.c_str();
}
/**
* @brief Allocates a new CANN buffer of the specified type and size.
*
@@ -1997,7 +1997,7 @@ static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
if (!ggml_backend_buft_is_cann(src->buffer->buft) || !ggml_backend_buft_is_cann(dst->buffer->buft)) {
if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
return false;
}
@@ -2523,6 +2523,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
GGML_UNUSED(dev);
}
/**
* @brief Checks if the backend buffer type is associated with the CANN backend.
*
* This function checks whether the provided backend buffer type is associated
* with the CANN backend based on the comparison of its name retrieval function
* pointer.
*
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the buffer type is associated with the CANN
* backend, otherwise false.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
* @brief Records an event on the CANN backend stream.
*
@@ -2567,7 +2582,11 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -9,11 +9,6 @@ function(ggml_add_cpu_backend_features cpu_name arch)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
# Disable LTO for the feature detection code to prevent cross-module optimization
# from inlining architecture-specific instructions into the score function.
# Without this, LTO can cause SIGILL when loading backends on older CPUs
# (e.g., loading power10 backend on power9 crashes before feature check runs).
target_compile_options(${GGML_CPU_FEATS_NAME} PRIVATE -fno-lto)
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
endfunction()
@@ -566,32 +561,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.22.0")
set(KLEIDIAI_COMMIT_TAG "v1.16.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
FetchContent_Declare(KleidiAI_Download
URL ${KLEIDIAI_DOWNLOAD_URL}
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
FetchContent_MakeAvailable(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)
if (NOT KLEIDIAI_POPULATED)
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
message(FATAL_ERROR "KleidiAI source download failed.")
endif()
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
# Remove kleidiai target after fetching it
if (TARGET kleidiai)
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
list(APPEND GGML_CPU_SOURCES
ggml-cpu/kleidiai/kleidiai.cpp
ggml-cpu/kleidiai/kernels.cpp
@@ -608,7 +606,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
@@ -649,6 +646,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
@@ -656,13 +654,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_f16pmrx2_f32_neon.c
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2+sme2+fp16")
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif()
if (NOT SVE_ENABLED MATCHES -1)

View File

@@ -141,50 +141,27 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
namespace ggml::cpu::amx {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
if (op->op != GGML_OP_MUL_MAT) {
return false;
}
auto * src0 = op->src[0];
auto * src1 = op->src[1];
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
return false;
}
if (!src0->buffer || src0->buffer->buft != ggml_backend_amx_buffer_type()) {
return false;
}
if (src1->buffer && !ggml_backend_buft_is_host(src1->buffer->buft)) {
return false;
}
if (op->ne[0] % (TILE_N * 2)) {
return false;
}
int alignment;
switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q8_0:
alignment = TILE_K;
break;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_XS:
alignment = 256; // QK_K
break;
case GGML_TYPE_F16:
alignment = 16;
break;
default:
if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
is_contiguous_2d(op->src[1]) && // src1 must be contiguous
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
// src1 must be host buffer
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
// src1 must be float32
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
}
if (src0->ne[0] % alignment) {
return false;
}
if (src1->type != GGML_TYPE_F32) {
return false;
}
return true;
return false;
}
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {

View File

@@ -9,8 +9,6 @@
#if defined(GGML_USE_OPENMP)
#include <omp.h>
#else
#include <thread>
#endif
#define TILE_M 16
@@ -58,40 +56,18 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
}
template <typename func_t>
inline void parallel_for(int n, const func_t & f) {
if (n <= 0) {
return;
}
inline void parallel_for(int n, const func_t& f) {
#if defined(GGML_USE_OPENMP)
#pragma omp parallel
{
int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
}
#pragma omp parallel
{
int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
}
#else
int nth = std::thread::hardware_concurrency();
if (nth <= 1) {
f(0, n);
return;
}
if (nth > n) {
nth = n;
}
std::vector<std::thread> threads;
threads.reserve(nth);
for (int ith = 0; ith < nth; ++ith) {
threads.emplace_back([&f, n, ith, nth] {
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
});
}
for (auto & t : threads) {
t.join();
}
f(0, n);
#endif
}
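A usage sketch (the per-index work is hypothetical and must be thread-safe); both variants in this diff keep the same call shape, handing each worker a [begin, end) range of [0, n):

    parallel_for(1024, [&](int begin, int end) {
        for (int i = begin; i < end; ++i) {
            process_row(i); // hypothetical per-row work
        }
    });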

View File

@@ -1,3 +1,4 @@
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
@@ -201,27 +202,35 @@ struct tile_config_t{
// advanced-matrix-extensions-intrinsics-functions.html
//
inline void ggml_tile_config_init(void) {
static thread_local bool done = false;
#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb
void ggml_tile_config_init(void) {
static thread_local bool is_first_time = true;
if (done) {
if (!is_first_time) {
return;
}
alignas(64) tile_config_t tc = {};
tc.palette_id = 1;
tc.start_row = 0;
tc.rows[0] = 8; tc.colsb[0] = 64;
tc.rows[1] = 8; tc.colsb[1] = 64;
tc.rows[2] = 16; tc.colsb[2] = 32;
tc.rows[3] = 16; tc.colsb[3] = 32;
tc.rows[4] = 16; tc.colsb[4] = 64;
tc.rows[5] = 16; tc.colsb[5] = 64;
tc.rows[6] = 16; tc.colsb[6] = 64;
tc.rows[7] = 16; tc.colsb[7] = 64;
static thread_local tile_config_t tc;
tile_config_t current_tc;
_tile_storeconfig(&current_tc);
_tile_loadconfig(&tc);
done = true;
// load only when config changes
if (tc.palette_id == 0 || (memcmp(&current_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 &&
memcmp(&current_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) {
tc.palette_id = 1;
tc.start_row = 0;
TC_CONFIG_TILE(TMM0, 8, 64);
TC_CONFIG_TILE(TMM1, 8, 64);
TC_CONFIG_TILE(TMM2, 16, 32);
TC_CONFIG_TILE(TMM3, 16, 32);
TC_CONFIG_TILE(TMM4, 16, 64);
TC_CONFIG_TILE(TMM5, 16, 64);
TC_CONFIG_TILE(TMM6, 16, 64);
TC_CONFIG_TILE(TMM7, 16, 64);
_tile_loadconfig(&tc);
}
is_first_time = false;
}
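Both variants cache per-thread state so the comparatively expensive _tile_loadconfig is not reissued on every GEMM call; the variant that reads the live configuration back with _tile_storeconfig additionally skips the reload when the active tile shape already matches.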
// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation.
@@ -259,6 +268,33 @@ int get_row_size(int K) {
return row_size;
}
// vectorized dtype conversion
inline float FP16_TO_FP32(ggml_half val) {
__m256i v = _mm256_setr_epi16(
val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 o = _mm512_cvtph_ps(v);
return _mm512_cvtss_f32(o);
}
inline __m512 FP16_TO_FP32_VEC(ggml_half val) {
__m256i v = _mm256_set1_epi16(val);
return _mm512_cvtph_ps(v);
}
// horizontal reduce
inline float _mm512_reduce_max_ps(const __m512 x) {
__m512 v = x;
__m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_ps(v, v, 0x4E);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_ps(v, v, 0xB1);
v = _mm512_max_ps(v, v1);
return _mm512_cvtss_f32(v);
}
// transpose utils
#define SHUFFLE_EPI32(a, b, mask) \
_mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))
@@ -1334,9 +1370,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE) \
tinygemm_kernel_avx<float, type, float, MB_SIZE, NB_SIZE, blck_size>::apply( \
K, (const float *)src1->data + src1_offset + mb_start * K, \
(const type *)src0->data + src0_offset + nb_start * K, \
(float *)dst->data + dst_offset + mb_start * ldc + nb_start, ldc)
K, (const float *)src1->data + mb_start * K, \
(const type *)src0->data + nb_start * K, \
(float *)dst->data + mb_start * ldc + nb_start, ldc);
// re-organize in the format {NB, KB, TILE_SIZE}:
@@ -1983,11 +2019,11 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
}
};
#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \
tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply( \
KB, wdata_batch, \
(const char *)src0->data + src0_offset + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \
(float *) dst->data + dst_offset + nb_start, ldc)
#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \
tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply( \
KB, (const char *)wdata + 0 * row_size_A, \
(const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \
(float *) dst->data + 0 * N + nb_start, ldc)
template <typename TA, typename TB, typename TC, int BLOCK_K,
typename std::enable_if<!is_type_qkk<TB>::value, int>::type = 0>
@@ -2043,7 +2079,7 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
_tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t));
if (need_unpack) {
unpack_B<TB>(Tile1, B_blk1);
unpack_B<TB>(Tile1, B_blk0);
_tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
} else {
_tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
@@ -2300,13 +2336,6 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
});
}
// ne2 is passed explicitly to help compiler optimize repeated calls
inline int64_t ggml_batch_offset(const ggml_tensor * t, int64_t batch_idx, int64_t ne2) {
const int64_t i2 = batch_idx % ne2;
const int64_t i3 = batch_idx / ne2;
return i3 * t->nb[3] + i2 * t->nb[2];
}
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
@@ -2319,13 +2348,12 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
const int M = dst->ne[1];
const int K = src0->ne[0];
const int64_t n_batch = dst->ne[2] * dst->ne[3];
size_t desired_wsize = 0;
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
desired_wsize = n_batch * M * row_size_A;
desired_wsize = M * row_size_A;
});
return desired_wsize;
@@ -2337,7 +2365,7 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
// src1: input in shape of {M, K}, float32
// dst: output in shape of {M, N}, float32
//
// the function performs: dst = src1 @ src0.T for each batch
// the function performs: dst = src1 @ src0.T
//
void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
@@ -2354,26 +2382,17 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int K = src0->ne[0];
const int ldc = dst->nb[1] / dst->nb[0];
const int64_t ne2 = dst->ne[2];
const int64_t n_batch = ne2 * dst->ne[3];
if (is_floating_type) {
constexpr int BLOCK_M = 4;
constexpr int BLOCK_N = 6;
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
for (int i = begin; i < end; ++i) {
int batch_idx = i / (MB * NB);
int remaining = i % (MB * NB);
int mb = remaining / NB;
int nb = remaining % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
int mb = i / NB;
int nb = i % NB;
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
@@ -2405,10 +2424,10 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
void * wdata = params->wdata;
//TODO: performance improvement: merge quant A
// if (params->ith == 0) {
if (params->ith == 0) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
const size_t desired_wsize = n_batch * M * row_size_A;
const size_t desired_wsize = M * row_size_A;
if (params->wsize < desired_wsize) {
GGML_ABORT("insufficient work space size");
}
@@ -2417,19 +2436,12 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
}
}
});
const float * A_data = static_cast<const float *>(src1->data);
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
}
});
// }
}
ggml_barrier(params->threadpool);
@@ -2439,19 +2451,13 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
constexpr int BLOCK_N = TILE_N * kTilesN;
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, n_batch * NB, [&](int begin, int end) {
parallel_for_ggml(params, NB, [&](int begin, int end) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const int KB = K / blck_size;
const int TILE_SIZE = get_tile_size<type>();
const int row_size_A = KB * sizeof(vec_dot_type);
for (int i = begin; i < end; ++i) {
int batch_idx = i / NB;
int nb = i % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
const char * wdata_batch = (const char *)wdata + batch_idx * row_size_A;
int nb = i;
int nb_start = nb * BLOCK_N;
int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96
@@ -2475,7 +2481,7 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
// init tile config for each thread
ggml_tile_config_init();
@@ -2485,14 +2491,8 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int row_size_A = KB * sizeof(vec_dot_type);
for (int i = begin; i < end; ++i) {
int batch_idx = i / (MB * NB);
int remaining = i % (MB * NB);
int mb = remaining / NB;
int nb = remaining % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
const char * wdata_batch = (const char *)wdata + batch_idx * M * row_size_A;
int mb = i / NB;
int nb = i % NB;
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
@@ -2501,9 +2501,9 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
tinygemm_kernel_amx<vec_dot_type, type, float, blck_size>(
mb_size, nb_size, KB,
wdata_batch + mb_start * row_size_A,
(const char *)src0->data + src0_offset + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
(float *) dst->data + dst_offset + mb_start * N + nb_start, ldc);
(const char *)wdata + mb_start * row_size_A,
(const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
(float *) dst->data + mb_start * N + nb_start, ldc);
}
});
});

View File

@@ -42,14 +42,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -58,14 +54,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
# define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
@@ -73,10 +65,8 @@
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
@@ -85,23 +75,17 @@
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
@@ -122,14 +106,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -138,14 +118,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__loongarch64)
@@ -166,14 +142,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -182,22 +154,24 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
@@ -211,14 +185,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -226,14 +196,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__s390x__)
@@ -260,14 +226,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -276,14 +238,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__wasm__)
@@ -312,14 +270,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -328,14 +282,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#endif
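
Each `_generic` alias above renames the portable implementation to the public symbol at preprocessing time, so an architecture without a tuned kernel for a given layout exports the generic code directly instead of going through a wrapper call. A minimal sketch of the same pattern (hypothetical names, not part of the diff):

/* sketch: how the _generic aliasing resolves, with made-up names */
#if defined(__SOME_SIMD_ARCH__)
/* a tuned kernel is provided elsewhere under the public name */
void my_gemv(int n, float * s, const void * vx, const void * vy);
#else
/* no tuned kernel: compile the generic body directly as the public symbol */
#define my_gemv_generic my_gemv
#endif

void my_gemv_generic(int n, float * s, const void * vx, const void * vy) {
    /* portable reference implementation would go here */
    (void)n; (void)s; (void)vx; (void)vy;
}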

File diff suppressed because it is too large


@@ -1954,773 +1954,3 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif
}
static const uint8_t sign_gather_indices_arr[64] = {
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,
4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7
};
static const uint8_t sign_bit_masks_arr[64] = {
1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128,
1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128
};
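// Lookup tables for expanding 8 packed sign bytes to 64 lanes: lane i gathers
// sign byte i/8 (sign_gather_indices_arr) and isolates bit i%8
// (sign_bit_masks_arr), yielding one negate flag per weight in two vector ops.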
static void ggml_vec_dot_iq2_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
const block_iq2_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
// --- Pre-load Constants ---
uint16_t gather_qh_arr[8] = {0, 0, 0, 0, 1, 1, 1, 1};
vuint16mf2_t v_gather_qh = __riscv_vle16_v_u16mf2(gather_qh_arr, 8);
uint16_t shift_qh_arr[8] = {11, 9, 7, 5, 11, 9, 7, 5};
vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 8);
// Constants for sign extraction
vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT scales = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const uint8_t * signs_ptr = qs + 32;
float sum_block = 0.0f;
for (int ib = 0; ib < 4; ++ib) {
// Combine low + high bits
vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 8);
qs += 8;
uint16_t qh_val;
memcpy(&qh_val, qh, 2);
qh += 2;
vuint8mf8_t v_qh_raw = __riscv_vle8_v_u8mf8((const uint8_t*)&qh_val, 2);
vuint16mf4_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf4(v_qh_raw, 2);
vuint16mf2_t v_qh_u16_ext = __riscv_vlmul_ext_v_u16mf4_u16mf2(v_qh_u16);
vuint16mf2_t v_qh_expanded = __riscv_vrgather_vv_u16mf2(v_qh_u16_ext, v_gather_qh, 8);
v_qh_expanded = __riscv_vsll_vv_u16mf2(v_qh_expanded, v_shift_qh, 8);
// Mask: We want bits 11-12. 0x1800 = 0001 1000 0000 0000
v_qh_expanded = __riscv_vand_vx_u16mf2(v_qh_expanded, 0x1800, 8);
vuint16mf2_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 8);
// Multiply by 8 to get byte offset, instead of element offset
v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16, 3, 8);
vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_expanded, 8);
// Lookup Grid using Byte Offsets
vuint64m2_t v_grid_vals = __riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 8);
vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u64m2_u8m2(v_grid_vals);
vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(v_grid_u8);
// Load signs and generate sign mask
vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 8);
signs_ptr += 8;
vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
q8 += 64;
vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 64);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 0), v_zero, 16));
int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 1), v_zero, 16));
int32_t s2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 2), v_zero, 16));
int32_t s3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 3), v_zero, 16));
uint8_t sc0 = scales[0];
uint8_t sc1 = scales[1];
scales += 2;
sum_block += s0 * (2 * (sc0 & 0xF) + 1);
sum_block += s1 * (2 * (sc0 >> 4) + 1);
sum_block += s2 * (2 * (sc1 & 0xF) + 1);
sum_block += s3 * (2 * (sc1 >> 4) + 1);
}
sumf += sum_block * combined_scale;
}
*s = 0.125f * sumf;
}
static void ggml_vec_dot_iq2_s_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
const block_iq2_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
// Pre-load Constants
vuint8m2_t v_ids = __riscv_vid_v_u8m2(32);
vuint8m2_t v_sign_gather_indices = __riscv_vsrl_vx_u8m2(v_ids, 3, 32);
vuint8m2_t v_ones = __riscv_vmv_v_x_u8m2(1, 32);
vuint8m2_t v_shift_amts = __riscv_vand_vx_u8m2(v_ids, 7, 32);
vuint8m2_t v_sign_masks = __riscv_vsll_vv_u8m2(v_ones, v_shift_amts, 32);
uint16_t shift_qh_arr[4] = {11, 9, 7, 5};
vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 4);
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT scales = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const uint8_t * signs_ptr = qs + 32;
float sum_block = 0.0f;
for (int ib = 0; ib < 8; ++ib) {
// Load Low Bits [4 bytes]
vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 4);
qs += 4;
// Load 1 byte. It contains bits for 4 mini-blocks.
uint8_t qh_val = *qh++;
// Combine Low + High bits of 10bit indices
vuint8mf4_t v_qh_raw = __riscv_vmv_v_x_u8mf4(qh_val, 4);
vuint16mf2_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qh_raw, 4);
vuint16mf2_t v_qh_mf2 = __riscv_vsll_vv_u16mf2(v_qh_u16, v_shift_qh, 4);
v_qh_mf2 = __riscv_vand_vx_u16mf2(v_qh_mf2, 0x1800, 4);
vuint16mf2_t v_qs_u16_mf2 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 4);
vuint16mf2_t v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16_mf2, 3, 4);
vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_mf2, 4);
// Lookup Grid
vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(__riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 4)));
vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 4);
signs_ptr += 4;
vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 32);
// generating sign mask
vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 32);
vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 32);
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 32);
q8 += 32;
// apply signs
vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 32);
vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 32);
// Reduction
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
// Reduce 0-15 (First Half)
int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
__riscv_vget_v_i16m4_i16m2(v_dot, 0), v_zero, 16));
// Reduce 16-31 (Second Half)
int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
__riscv_vget_v_i16m4_i16m2(v_dot, 1), v_zero, 16));
// Apply sub Scales
uint8_t sc = *scales++;
sum_block += s0 * (2 * (sc & 0xF) + 1);
sum_block += s1 * (2 * (sc >> 4) + 1);
}
sumf += sum_block * combined_scale;
}
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
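// __riscv_vlenb() returns the vector register width in bytes at run time;
// dispatch to the kernel tuned for that VLEN and fall back to the portable
// generic code on any other width. The same pattern is used by the other
// dispatchers below.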
switch (__riscv_vlenb() * 8) {
case 128:
ggml_vec_dot_iq2_s_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
break;
case 256:
ggml_vec_dot_iq2_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq3_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint64_t * grid64 = (const uint64_t *)iq3s_grid;
// --- Pre-load Constants ---
const uint16_t qh_bit_shifts_arr[16] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
vuint16m1_t v_qh_shifts = __riscv_vle16_v_u16m1(qh_bit_shifts_arr, 16);
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_CPU_FP16_TO_FP32(x[i].d);
const float combined_scale = d * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT scales = x[i].scales;
const uint8_t * GGML_RESTRICT signs = x[i].signs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
float sum_block = 0.0f;
// Loop: Process 64 weights (16 mini-blocks of 4) per iteration
for (int ib = 0; ib < 4; ++ib) {
vuint8mf2_t v_qs_u8 = __riscv_vle8_v_u8mf2(qs, 16);
qs += 16;
uint16_t qh_val;
memcpy(&qh_val, qh, 2);
qh += 2;
vuint16m1_t v_qh_val = __riscv_vmv_v_x_u16m1(qh_val, 16);
// Extract bits: (qh >> i) & 1
v_qh_val = __riscv_vsrl_vv_u16m1(v_qh_val, v_qh_shifts, 16);
v_qh_val = __riscv_vand_vx_u16m1(v_qh_val, 1, 16);
vuint16m1_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16m1(v_qs_u8, 16);
v_qs_u16 = __riscv_vsll_vx_u16m1(v_qs_u16, 2, 16);
v_qh_val = __riscv_vsll_vx_u16m1(v_qh_val, 10, 16);
vuint16m1_t v_grid_offsets = __riscv_vor_vv_u16m1(v_qs_u16, v_qh_val, 16);
// Grid value is 4xuint8
vuint32m2_t v_grid_packed = __riscv_vluxei16_v_u32m2((const uint32_t *)grid64, v_grid_offsets, 16);
vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u32m2_u8m2(v_grid_packed);
vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs, 8);
signs += 8;
// Generate sign mask
vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
q8 += 64;
// Apply Signs
vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
vint16m4_t v_dot = __riscv_vwmulsu_vv_i16m4(v_q8_signed, v_grid_u8, 64);
// Reduction
vint16m2_t v_dot_lo = __riscv_vget_v_i16m4_i16m2(v_dot, 0);
vint16m2_t v_dot_hi = __riscv_vget_v_i16m4_i16m2(v_dot, 1);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
int32_t s_lo = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_lo, v_zero, 32));
int32_t s_hi = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_hi, v_zero, 32));
// Apply sub-scales
uint8_t sc_byte = *scales++;
int sc_lo = (sc_byte & 0xF) * 2 + 1;
int sc_hi = (sc_byte >> 4) * 2 + 1;
sum_block += s_lo * sc_lo + s_hi * sc_hi;
}
sumf += sum_block * combined_scale;
}
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_iq3_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_tq1_0 * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0.0f;
uint8_t pow[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
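// tq1_0 stores 5 ternary digits per qs byte in fixed-point base 3: multiplying
// the byte by 3^k rotates digit k to the top, and ((uint8_t)(q * 3^k) * 3) >> 8
// recovers it as 0..2 (mapped to -1..1 by the "- 1" below). The main loops use
// explicit 3^k constants; the pow[] table above replicates 3^0..3^3 per 4-byte
// group for the qh tail, which packs 4 trits per byte.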
for (int i = 0; i < nb; i++) {
// First loop.
vint32m4_t suml1;
{
const int vl = 32;
vuint8m1_t tq = __riscv_vle8_v_u8m1(x[i].qs, vl);
vuint16m2_t tq0 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(tq, 3, vl), 8, vl);
vuint16m2_t tq1 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 3, vl), 3, vl), 8, vl);
vuint16m2_t tq2 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 9, vl), 3, vl), 8, vl);
vuint16m2_t tq3 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 27, vl), 3, vl), 8, vl);
vuint16m2_t tq4 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 81, vl), 3, vl), 8, vl);
vint16m2_t q80 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 0, vl), vl);
vint16m2_t q81 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 32, vl), vl);
vint16m2_t q82 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 64, vl), vl);
vint16m2_t q83 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 96, vl), vl);
vint16m2_t q84 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 128, vl), vl);
vint16m2_t sum0 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq0, 1, vl)), q80, vl);
vint16m2_t sum1 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq1, 1, vl)), q81, vl);
vint16m2_t sum2 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq2, 1, vl)), q82, vl);
vint16m2_t sum3 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq3, 1, vl)), q83, vl);
vint16m2_t sum4 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq4, 1, vl)), q84, vl);
vint32m4_t sumi0 = __riscv_vwadd_vv_i32m4(sum0, sum1, vl);
vint32m4_t sumi1 = __riscv_vwadd_vv_i32m4(sum2, sum3, vl);
suml1 = __riscv_vadd_vv_i32m4(__riscv_vwcvt_x_x_v_i32m4(sum4, vl), __riscv_vadd_vv_i32m4(sumi0, sumi1, vl), vl);
}
// Second loop.
vint32m2_t suml2;
{
const int vl = 16;
vuint8mf2_t tq = __riscv_vle8_v_u8mf2(x[i].qs + 32, vl);
vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(tq, 3 * 1, vl), 8, vl);
vuint16m1_t tq1 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 3, vl), 3, vl), 8, vl);
vuint16m1_t tq2 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 9, vl), 3, vl), 8, vl);
vuint16m1_t tq3 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 27, vl), 3, vl), 8, vl);
vuint16m1_t tq4 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 81, vl), 3, vl), 8, vl);
vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 160, vl), vl);
vint16m1_t q81 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 176, vl), vl);
vint16m1_t q82 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 192, vl), vl);
vint16m1_t q83 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 208, vl), vl);
vint16m1_t q84 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 224, vl), vl);
vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
vint16m1_t sum1 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq1, 1, vl)), q81, vl);
vint16m1_t sum2 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq2, 1, vl)), q82, vl);
vint16m1_t sum3 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq3, 1, vl)), q83, vl);
vint16m1_t sum4 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq4, 1, vl)), q84, vl);
vint32m2_t sumi0 = __riscv_vwadd_vv_i32m2(sum0, sum1, vl);
vint32m2_t sumi1 = __riscv_vwadd_vv_i32m2(sum2, sum3, vl);
suml2 = __riscv_vadd_vv_i32m2(__riscv_vwcvt_x_x_v_i32m2(sum4, vl), __riscv_vadd_vv_i32m2(sumi0, sumi1, vl), vl);
}
// Third loop.
vint32m2_t suml3;
{
const int vl = 16;
uint32_t qh;
memcpy(&qh, &x[i].qh[0], 4);
// Prevent fusion with vmv.
__asm__ __volatile__("" : "+r"(qh));
vuint8mf2_t tq = __riscv_vreinterpret_v_u32mf2_u8mf2(__riscv_vmv_v_x_u32mf2(qh, vl / 4));
vuint8mf2_t p = __riscv_vle8_v_u8mf2(pow, vl);
vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vv_u8mf2(tq, p, vl), 3, vl), 8, vl);
vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 240, vl), vl);
vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
suml3 = __riscv_vwcvt_x_x_v_i32m2(sum0, vl);
}
vint32m2_t sumb = __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(suml1, 0), __riscv_vget_v_i32m4_i32m2(suml1, 1), 16);
sumb = __riscv_vadd_vv_i32m2(sumb, suml2, 16);
sumb = __riscv_vadd_vv_i32m2(sumb, suml3, 16);
vint32m1_t sum = __riscv_vredsum_vs_i32m2_i32m1(sumb, __riscv_vmv_v_x_i32m1(0, 1), 16);
sumf += __riscv_vmv_x_s_i32m1_i32(sum) * y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
}
*s = sumf;
}
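For reference, the trit-extraction identity those loops rely on can be checked in a few lines of scalar C. A self-contained sketch, not part of the diff, assuming the usual ceil(v * 256 / 243) fixed-point packing:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const int pow3[5] = {1, 3, 9, 27, 81};
    for (int v = 0; v < 243; ++v) {                    // v encodes trits t0..t4, t0 most significant
        uint8_t q = (uint8_t)((v * 256 + 242) / 243);  // fixed-point base-3 byte
        int rest = v;
        for (int k = 0; k < 5; ++k) {
            int expect = rest / pow3[4 - k];           // k-th trit, 0..2
            rest      %= pow3[4 - k];
            int got = ((uint8_t)(q * pow3[k]) * 3) >> 8; // the kernel's extraction step
            assert(got == expect);
        }
    }
    puts("trit extraction holds for all 243 encodings");
    return 0;
}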
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_tq1_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_tq2_0 * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
int32_t sumi = 0;
for (size_t j = 0; j < sizeof(x[0].qs); j += 32) {
const int8_t * py0 = &y[i].qs[j * 4 + 0 * 32];
const int8_t * py1 = &y[i].qs[j * 4 + 1 * 32];
const int8_t * py2 = &y[i].qs[j * 4 + 2 * 32];
const int8_t * py3 = &y[i].qs[j * 4 + 3 * 32];
const uint8_t * px = &x[i].qs[j];
size_t vlmax_16m2 = __riscv_vsetvl_e16m2(32);
vint16m2_t vacc16 = __riscv_vmv_v_x_i16m2(0, vlmax_16m2);
size_t vl = __riscv_vsetvl_e8m1(32);
vuint8m1_t vx_u8 = __riscv_vle8_v_u8m1(px, vl);
vint8m1_t vy0 = __riscv_vle8_v_i8m1(py0, vl);
vint8m1_t vy1 = __riscv_vle8_v_i8m1(py1, vl);
vint8m1_t vy2 = __riscv_vle8_v_i8m1(py2, vl);
vint8m1_t vy3 = __riscv_vle8_v_i8m1(py3, vl);
// l=0 (bits 1:0)
vuint8m1_t t0 = __riscv_vand_vx_u8m1(vx_u8, 0x03, vl);
vint8m1_t vq0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t0), 1, vl);
// l=1 (bits 3:2)
vuint8m1_t t1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 2, vl), 0x03, vl);
vint8m1_t vq1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t1), 1, vl);
// l=2 (bits 5:4)
vuint8m1_t t2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 4, vl), 0x03, vl);
vint8m1_t vq2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t2), 1, vl);
// l=3 (bits 7:6)
vuint8m1_t t3 = __riscv_vsrl_vx_u8m1(vx_u8, 6, vl); // No final AND needed as vsrl shifts in zeros
vint8m1_t vq3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t3), 1, vl);
// 4. Multiply and accumulate
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq0, vy0, vl);
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq1, vy1, vl);
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq2, vy2, vl);
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq3, vy3, vl);
vlmax_16m2 = __riscv_vsetvl_e16m2(32);
vint32m1_t vzero32 = __riscv_vmv_v_x_i32m1(0, 1);
vint32m1_t vred32 = __riscv_vwredsum_vs_i16m2_i32m1(vacc16, vzero32, vlmax_16m2);
sumi += __riscv_vmv_x_s_i32m1_i32(vred32);
}
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
sumf += (float)sumi * d;
}
*s = sumf;
}
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_tq2_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq1_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
// Load qh once for the entire superblock.
vuint16mf2_t qh = __riscv_vle16_v_u16mf2(x[i].qh, 8);
// Calculate ls.
vuint16mf2_t temp = __riscv_vsrl_vx_u16mf2(qh, 12, 8);
temp = __riscv_vand_vx_u16mf2(temp, 7, 8);
vint32m1_t ls = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwmulu_vx_u32m1(temp, 2, 8));
ls = __riscv_vadd_vx_i32m1(ls, 1, 8);
// Calculate delta.
vbool32_t mask = __riscv_vmseq_vx_u16mf2_b32(__riscv_vand_vx_u16mf2(qh, 0x8000, 8), 0, 8);
vint32m1_t delta_neg = __riscv_vmv_v_x_i32m1(-1, 8);
vint32m1_t delta_pos = __riscv_vmv_v_x_i32m1(1, 8);
vint32m1_t delta = __riscv_vmerge_vvm_i32m1(delta_neg, delta_pos, mask, 8);
// Load qs.
vuint8m1_t qs = __riscv_vle8_v_u8m1(x[i].qs, 32);
// Prepare the indices.
const uint64_t shift = 0x0009000600030000;
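// The 16-bit lanes of `shift` repeat {0, 3, 6, 9} per 64-bit group, so each qh
// word is shifted to expose the 3 high index bits of each of its four
// 8-element groups before they are OR-ed with the low byte from qs.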
vuint16m2_t qh_shift = __riscv_vreinterpret_v_u64m2_u16m2(__riscv_vmv_v_x_u64m2(shift, 8));
vuint16m2_t qh_gather_index = __riscv_vreinterpret_v_i16m2_u16m2(
__riscv_vdiv_vx_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(32)), 4, 32));
vuint16m2_t qh_ext = __riscv_vlmul_ext_v_u16m1_u16m2(__riscv_vlmul_ext_v_u16mf2_u16m1(qh));
vuint16m2_t qh_index = __riscv_vrgather_vv_u16m2(qh_ext, qh_gather_index, 32);
qh_index = __riscv_vsrl_vv_u16m2(qh_index, qh_shift, 32);
qh_index = __riscv_vand_vx_u16m2(qh_index, 7, 32);
qh_index = __riscv_vsll_vx_u16m2(qh_index, 8, 32);
qh_index = __riscv_vor_vv_u16m2(qh_index, __riscv_vzext_vf2_u16m2(qs, 32), 32);
vuint16m2_t index = __riscv_vsll_vx_u16m2(qh_index, 3, 32);
// Final lsums.
int32_t lsums_s[8];
vint32m1_t zero_scalar = __riscv_vmv_v_x_i32m1(0, 1);
// Sub-blocks 1-4
{
vuint16m1_t grid_index0 = __riscv_vget_v_u16m2_u16m1(index, 0);
vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 16));
vint8m4_t q80 = __riscv_vle8_v_i8m4(y[i].qs, 128);
vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
lsums_s[0] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 0), zero_scalar, 32));
lsums_s[1] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 1), zero_scalar, 32));
lsums_s[2] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 2), zero_scalar, 32));
lsums_s[3] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 3), zero_scalar, 32));
}
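// Compiler barrier: keeps the two 128-lane halves as separate live ranges so
// their vector registers do not overlap (same purpose after the next block).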
__asm__ __volatile__("" ::: "memory");
// Sub-blocks 5-8
{
vuint16m1_t grid_index1 = __riscv_vget_v_u16m2_u16m1(index, 1);
vint8m4_t grid1 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index1, 16));
vint8m4_t q81 = __riscv_vle8_v_i8m4(&y[i].qs[128], 128);
vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(grid1, q81, 128);
lsums_s[4] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 0), zero_scalar, 32));
lsums_s[5] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 1), zero_scalar, 32));
lsums_s[6] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 2), zero_scalar, 32));
lsums_s[7] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 3), zero_scalar, 32));
}
__asm__ __volatile__("" ::: "memory");
vint32m1_t lsums = __riscv_vle32_v_i32m1(&lsums_s[0], 8);
// Calculate the bsums.
vint16m1_t bsums_0 = __riscv_vle16_v_i16m1(y[i].bsums, 16);
const vuint32m1_t bsums_i32 = __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(bsums_0));
const vint16mf2_t bsums_i32_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 0, 8));
const vint16mf2_t bsums_i32_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 16, 8));
const vint32m1_t bsums = __riscv_vwadd_vv_i32m1(bsums_i32_0, bsums_i32_1, 8);
// Accumulation.
vint32m1_t sumi_v = __riscv_vmul_vv_i32m1(ls, lsums, 8);
vint32m1_t sumi1_v = __riscv_vmul_vv_i32m1(__riscv_vmul_vv_i32m1(ls, delta, 8), bsums, 8);
// Update sumf.
int sumi = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi_v, __riscv_vmv_v_x_i32m1(0, 1), 8));
int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi1_v, __riscv_vmv_v_x_i32m1(0, 1), 8));
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}
*s = sumf;
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_iq1_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq1_m * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
iq1m_scale_t scale;
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
const uint16_t * sc = (const uint16_t *)x[i].scales;
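// Reassemble the fp16 superblock scale from the top nibbles of the four
// 16-bit scale words.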
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
// Accumulators.
vint32m2_t acc1 = __riscv_vmv_v_x_i32m2(0, 16);
vint32m2_t acc2 = __riscv_vmv_v_x_i32m2(0, 16);
// We process 4 sub-blocks together.
for (int ib = 0; ib < QK_K/128; ib++) {
// Load qh for 4 sub-blocks.
const vuint8mf4_t qh_8 = __riscv_vle8_v_u8mf4(qh, 8);
const vuint16mf2_t qh_16_lo = __riscv_vzext_vf2_u16mf2(qh_8, 8);
const vuint16mf2_t qh_16_hi = __riscv_vsll_vx_u16mf2(qh_16_lo, 8, 8);
const vuint16m1_t qhb = __riscv_vzext_vf2_u16m1(
__riscv_vreinterpret_v_u16mf2_u8mf2(__riscv_vor_vv_u16mf2(qh_16_lo, qh_16_hi, 8)), 16);
qh += 8;
// Prepare grid indices.
const vuint16m1_t qsb = __riscv_vzext_vf2_u16m1(__riscv_vle8_v_u8mf2(&qs[0], 16), 16);
const vuint16m1_t shift = __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00040008, 8));
vuint16m1_t index = __riscv_vor_vv_u16m1(qsb, __riscv_vand_vx_u16m1(__riscv_vsll_vv_u16m1(qhb, shift, 16), 0x700, 16), 16);
index = __riscv_vsll_vx_u16m1(index, 3, 16);
qs += 16;
// Load the grid.
const vint8m4_t iq1b = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vreinterpret_v_u64m4_i64m4(
__riscv_vluxei16_v_u64m4(iq1s_grid, index, 16)));
// Prepare the deltas.
const vbool16_t mask = __riscv_vmsgtu_vx_u16m1_b16(
__riscv_vand_vv_u16m1(qhb, __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00800008, 8)), 16), 0, 16);
const vint64m4_t delta_pos = __riscv_vmv_v_x_i64m4(0x0101010101010101, 16);
const vint64m4_t delta_neg = __riscv_vmv_v_x_i64m4(0xffffffffffffffff, 16);
const vint8m4_t delta = __riscv_vreinterpret_v_i64m4_i8m4(
__riscv_vmerge_vvm_i64m4(delta_pos, delta_neg, mask, 16));
// Load q8 for sub-blocks.
const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
q8 += 128;
// Calculate the lsums.
const vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(iq1b, q8b, 128);
const vint16m8_t lsum2 = __riscv_vwmul_vv_i16m8(delta, q8b, 128);
// Prepare the scales.
const int16_t ls_0_0 = 2*((sc[0] >> 0) & 0x7) + 1;
const int16_t ls_0_1 = 2*((sc[0] >> 3) & 0x7) + 1;
const int16_t ls_1_0 = 2*((sc[0] >> 6) & 0x7) + 1;
const int16_t ls_1_1 = 2*((sc[0] >> 9) & 0x7) + 1;
const int16_t ls_2_0 = 2*((sc[1] >> 0) & 0x7) + 1;
const int16_t ls_2_1 = 2*((sc[1] >> 3) & 0x7) + 1;
const int16_t ls_3_0 = 2*((sc[1] >> 6) & 0x7) + 1;
const int16_t ls_3_1 = 2*((sc[1] >> 9) & 0x7) + 1;
sc += 2;
// Accumulate in acc0 and acc1 for each sub-block.
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum1, 0), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum1, 1), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum2, 0), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum2, 1), 16);
//
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum1, 2), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum1, 3), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum2, 2), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum2, 3), 16);
//
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum1, 4), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum1, 5), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum2, 4), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum2, 5), 16);
//
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum1, 6), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum1, 7), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum2, 6), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum2, 7), 16);
}
// Reduce and accumulate in `sumf`.
vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, 1);
int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc1, zero, 16));
int sumi2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc2, zero, 16));
sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (sumi1 + IQ1M_DELTA * sumi2);
}
*s = sumf;
}
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_iq1_m_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}


@@ -181,11 +181,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
-const int16x8_t v_xyl = vec_meadd(v_xls, v_yl, v_xylso);
+const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
-const int16x8_t v_xyh = vec_meadd(v_xhs, v_yh, v_xyhso);
+const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
-int16x8_t v_xy_ = v_xyl + v_xyh; v_xy_ += vec_reve(v_xy_);
+int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
@@ -890,7 +890,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
-const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minso);
+const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
+const int32x4_t v_mins = v_minso + v_minse;
sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
const uint8_t * scales = (const uint8_t *)utmp;
@@ -1003,7 +1004,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
-const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minsho);
+const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
+const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
const int32_t mins = vec_hsum_i32x4(v_mins);
const uint8_t * scales = (const uint8_t *)utmp;
@@ -1108,10 +1110,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_scaleh = vec_unpackl(v_scale);
const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
-const int32x4_t v_minsl = vec_meadd(v_ysumsl, v_scalel, v_minslo);
+const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
-const int32x4_t v_minsh = vec_meadd(v_ysumsh, v_scaleh, v_minsho);
-const int32x4_t v_mins = vec_add(v_minsl, v_minsh);
+const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
+const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
const int32_t mins = vec_hsum_i32x4(v_mins);
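
These hunks unfuse the vec_meadd calls into separate vec_mule plus vector adds. They rely on a simple identity: a widening dot product splits into an even-lane half (vec_mule) and an odd-lane half (vec_mulo), and summing both reproduces the fused multiply-even-add result. A standalone scalar check of that identity (sketch, not part of the diff):

#include <assert.h>
#include <stdint.h>

int main(void) {
    int8_t a[16], b[16];
    for (int i = 0; i < 16; ++i) { a[i] = (int8_t)(i * 7 - 50); b[i] = (int8_t)(90 - i * 11); }
    int32_t direct = 0, even = 0, odd = 0;
    for (int i = 0; i < 16; ++i)     direct += a[i] * b[i]; // full dot product
    for (int i = 0; i < 16; i += 2)  even   += a[i] * b[i]; // vec_mule lanes
    for (int i = 1; i < 16; i += 2)  odd    += a[i] * b[i]; // vec_mulo lanes
    assert(direct == even + odd);
    return 0;
}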

Some files were not shown because too many files have changed in this diff.