ggml-cpu: aarm64: q4_K repack gemm and gemv implementations (dotprod only) (#17494)

* Enabled q4_K_4x8 path
* Fixed generic Q4_K 8x4 implementation
* wip: dotprod gemm
* Working arm q4_K dotprod gemm
  Signed-off-by: Alberto Cabrera <alberto.cabrera@liquid.ai>
* Undo acc rename
  Signed-off-by: Alberto Cabrera <alberto.cabrera@liquid.ai>
* Q4_K arm dotprod gemm
  Signed-off-by: Alberto Cabrera <alberto.cabrera@liquid.ai>
* Fix: q4_qs reinterpret from uint to int
  Signed-off-by: Alberto Cabrera <alberto.cabrera@liquid.ai>
* Removed comments
* Fixed macro guards
* Fixed unused vars in generic implementation
* Fixed unused vars in 8x4 repack
* Fixed unused vars in generic implementation, unneeded comment
* Missing arch fallback for x86
* minor : style
---------
Signed-off-by: Alberto Cabrera <alberto.cabrera@liquid.ai>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
devops: Add build-essential to Ubuntu 26.04 image (#17531)
2026-02-05 13:53:23 +02:00 · 2025-11-27 13:25:14 +02:00 · 2025-11-27 18:35:47 +08:00 · 2025-11-27 11:35:38 +01:00 · 2025-11-27 06:54:19 +01:00 · 2025-11-27 06:32:30 +01:00
596 changed files with 174275 additions and 58708 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -3,7 +3,8 @@
 # ==============================================================================

 # Define the CANN base image for easier version updates later
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
+ARG CHIP_TYPE=910b
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -11,9 +12,6 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS build

-# Define the Ascend chip model for compilation. Default is Ascend910B3
-ARG ASCEND_SOC_TYPE=Ascend910B3
-
 # -- Install build dependencies --
 RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
@@ -36,20 +34,21 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 # For brevity, only core variables are listed here. You can paste the original ENV list here.

 # -- Build llama.cpp --
-# Use the passed ASCEND_SOC_TYPE argument and add general build options
+# Use the passed CHIP_TYPE argument and add general build options
+ARG CHIP_TYPE
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
-        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
+        -DSOC_TYPE=ascend${CHIP_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

 # -- Organize build artifacts for copying in later stages --
 # Create a lib directory to store all .so files
 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 # Create a full directory to store all executables and Python scripts
 RUN mkdir -p /app/full && \
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -20,7 +20,7 @@ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
    cmake --build build -j $(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -25,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -34,6 +34,7 @@
  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
  enableCurl ? true,
  useVulkan ? false,
+  useRpc ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

  # It's necessary to consistently use backendStdenv when building with CUDA support,
@@ -175,6 +176,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      (cmakeBool "GGML_METAL" useMetalKit)
      (cmakeBool "GGML_VULKAN" useVulkan)
      (cmakeBool "GGML_STATIC" enableStatic)
+      (cmakeBool "GGML_RPC" useRpc)
    ]
    ++ optionals useCuda [
      (
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -45,7 +45,7 @@ RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    && cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib \
-    && find build -name "*.so" -exec cp {} /app/lib \;
+    && find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -24,8 +24,9 @@ RUN --mount=type=cache,target=/root/.ccache \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_BACKEND_DL=OFF \
        -DGGML_NATIVE=OFF \
+        -DGGML_BACKEND_DL=ON \
+        -DGGML_CPU_ALL_VARIANTS=ON \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS && \
    cmake --build build --config Release -j $(nproc) && \
@@ -103,6 +104,7 @@ FROM base AS light
 WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
@@ -116,6 +118,7 @@ ENV LLAMA_ARG_HOST=0.0.0.0
 WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

 EXPOSE 8080
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,42 +1,24 @@
-ARG UBUNTU_VERSION=24.04
+ARG UBUNTU_VERSION=26.04

 FROM ubuntu:$UBUNTU_VERSION AS build

-# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
-
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils

-# Install Vulkan SDK
-ARG VULKAN_VERSION=1.4.321.1
-RUN ARCH=$(uname -m) && \
-    wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
-    mkdir -p /opt/vulkan && \
-    tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
-    mv /tmp/${ARCH}/* /opt/vulkan/ && \
-    rm -rf /tmp/*
-
 # Install cURL and Vulkan SDK dependencies
 RUN apt install -y libcurl4-openssl-dev curl \
-    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
-
-# Set environment variables
-ENV VULKAN_SDK=/opt/vulkan
-ENV PATH=$VULKAN_SDK/bin:$PATH
-ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
-ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
-ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH
+    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc

 # Build it
 WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1  -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -50,7 +32,7 @@ RUN mkdir -p /app/full \
 FROM ubuntu:$UBUNTU_VERSION AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl libvulkan-dev \
+    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -68,6 +50,7 @@ WORKDIR /app

 RUN apt-get update \
    && apt-get install -y \
+    build-essential \
    git \
    python3 \
    python3-pip \
--- a/.editorconfig
+++ b/.editorconfig
@@ -60,3 +60,11 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[benches/**]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -9,7 +9,7 @@ llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model
 - **Size**: ~200k+ lines of code across 1000+ files
 - **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples
 - **Core dependency**: ggml tensor library (vendored in `ggml/` directory)
- **Backends supported**: CPU (AVX/NEON optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
+- **Backends supported**: CPU (AVX/NEON/RVV optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
 - **License**: MIT

 ## Build Instructions
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -76,6 +76,10 @@ ggml:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/**
+model:
+    - changed-files:
+        - any-glob-to-any-file:
+            - src/models/**
 nix:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/build-amd.yml
+++ b/.github/workflows/build-amd.yml
@@ -1,52 +0,0 @@
-name: CI (AMD)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-amd.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.comp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ggml-ci-x64-amd-vulkan:
-    runs-on: [self-hosted, Linux, X64, AMD]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-amd-rocm:
-    runs-on: [self-hosted, Linux, X64, AMD]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          amd-smi static
-          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -4,49 +4,49 @@ on:
  workflow_call:

 jobs:
-  ubuntu-24-riscv64-cpu-cross:
-    runs-on: ubuntu-24.04
+  # ubuntu-24-riscv64-cpu-cross:
+  #   runs-on: ubuntu-24.04

-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo dpkg --add-architecture riscv64
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo dpkg --add-architecture riscv64

-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
+  #         # Add arch-specific repositories for non-amd64 architectures
+  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+  #         EOF

-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.

-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu

-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+  #     - name: Build
+  #       run: |
+  #         cmake -B build -DLLAMA_CURL=OFF \
+  #                        -DCMAKE_BUILD_TYPE=Release \
+  #                        -DGGML_OPENMP=OFF \
+  #                        -DLLAMA_BUILD_EXAMPLES=ON \
+  #                        -DLLAMA_BUILD_TOOLS=ON \
+  #                        -DLLAMA_BUILD_TESTS=OFF \
+  #                        -DCMAKE_SYSTEM_NAME=Linux \
+  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

-          cmake --build build --config Release -j $(nproc)
+  #         cmake --build build --config Release -j $(nproc)

  # ubuntu-24-riscv64-vulkan-cross:
  #   runs-on: ubuntu-24.04
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -69,13 +69,6 @@ jobs:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d

-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
      - name: Build
        id: cmake_build
        run: |
@@ -83,6 +76,8 @@ jobs:
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=OFF \
            -DGGML_METAL_SHADER_DEBUG=ON \
@@ -110,13 +105,6 @@ jobs:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d

-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
      - name: Build
        id: cmake_build
        run: |
@@ -126,6 +114,8 @@ jobs:
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL=OFF \
            -DGGML_RPC=ON \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@@ -151,25 +141,19 @@ jobs:
          key: macOS-latest-cmake-arm64-webgpu
          evict-old-files: 1d

-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
      - name: Dawn Dependency
        id: dawn-depends
        run: |
-          DAWN_VERSION="v1.0.0"
+          DAWN_VERSION="v2.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          curl -L -o artifact.tar.gz \
+          curl -L -o artifact.zip \
            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+          unzip artifact.zip
+          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1

      - name: Build
        id: cmake_build
@@ -216,7 +200,7 @@ jobs:
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            python3 python3-pip python3-dev \
-            libjpeg-dev build-essential libcurl4-openssl-dev \
+            libjpeg-dev build-essential libssl-dev \
            git-lfs

      - name: Python Dependencies
@@ -237,6 +221,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(nproc)
@@ -293,13 +279,15 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
+          sudo apt-get install build-essential libssl-dev

      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -310,6 +298,8 @@ jobs:
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@@ -334,7 +324,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
+          sudo apt-get install build-essential libssl-dev

      - name: Build
        id: cmake_build
@@ -342,6 +332,8 @@ jobs:
          mkdir build
          cd build
          cmake .. \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_LLGUIDANCE=ON
          cmake --build . --config Release -j $(nproc)
@@ -372,12 +364,14 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
+          sudo apt-get install build-essential libssl-dev

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(nproc)

@@ -404,12 +398,14 @@ jobs:
      - name: Dependencies
        id: depends
        run: |
-          sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev
+          sudo apt-get install -y glslc libvulkan-dev libssl-dev

      - name: Configure
        id: cmake_configure
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DGGML_BACKEND_DL=ON \
            -DGGML_CPU_ALL_VARIANTS=ON \
@@ -439,7 +435,7 @@ jobs:
        run: |
          sudo add-apt-repository -y ppa:kisak/kisak-mesa
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev

      - name: Get latest Vulkan SDK version
        id: vulkan_sdk_version
@@ -465,6 +461,8 @@ jobs:
        run: |
          source ./vulkan_sdk/setup-env.sh
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_VULKAN=ON
          cmake --build build --config Release -j $(nproc)

@@ -496,7 +494,7 @@ jobs:
        run: |
          sudo add-apt-repository -y ppa:kisak/kisak-mesa
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev

      - name: Get latest Vulkan SDK version
        id: vulkan_sdk_version
@@ -521,21 +519,25 @@ jobs:
        id: dawn-depends
        run: |
          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          DAWN_VERSION="v1.0.0"
+          DAWN_VERSION="v2.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          curl -L -o artifact.tar.gz \
+          curl -L -o artifact.zip \
            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+          unzip artifact.zip
+          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1

      - name: Build
        id: cmake_build
        run: |
          export Dawn_DIR=dawn/lib64/cmake/Dawn
-          cmake -B build -DGGML_WEBGPU=ON
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DGGML_WEBGPU=ON
          cmake --build build --config Release -j $(nproc)

      - name: Test
@@ -558,7 +560,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
@@ -570,6 +572,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build -S . \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
            -DGGML_HIP=ON
@@ -588,7 +592,7 @@ jobs:
        id: depends
        run: |
          apt-get update
-          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+          apt-get install -y build-essential git cmake libssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
@@ -600,6 +604,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build -S . \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_MUSA=ON
          cmake --build build --config Release -j $(nproc)

@@ -624,7 +630,7 @@ jobs:
        shell: bash
        run: |
          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev

      - name: install oneAPI MKL library
        shell: bash
@@ -646,6 +652,8 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx
@@ -672,7 +680,7 @@ jobs:
        shell: bash
        run: |
          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev

      - name: install oneAPI MKL library
        shell: bash
@@ -694,6 +702,8 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx \
@@ -720,12 +730,6 @@ jobs:
          key: macOS-latest-cmake-ios
          evict-old-files: 1d

-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
      - name: Build
        id: cmake_build
        run: |
@@ -757,12 +761,6 @@ jobs:
          key: macOS-latest-cmake-tvos
          evict-old-files: 1d

-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
      - name: Build
        id: cmake_build
        run: |
@@ -788,12 +786,6 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
      - name: Build
        id: cmake_build
        run: |
@@ -836,12 +828,6 @@ jobs:
          name: llama-xcframework
          path: build-apple/llama.xcframework/

-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
      - name: Build llama.cpp with CMake
        id: cmake_build
        run: |
@@ -993,21 +979,12 @@ jobs:
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
          cmake --build build-arm64-release --target install --config release

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-        with:
-          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          cmake -S . -B build ${{ matrix.defines }} `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+            -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

      - name: Add libopenblas.dll
        id: add_libopenblas_dll
@@ -1051,7 +1028,7 @@ jobs:
            DEBIAN_FRONTEND: noninteractive
          run: |
              apt update
-              apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
+              apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev

        - name: ccache
          uses: ggml-org/ccache-action@v1.2.16
@@ -1062,10 +1039,12 @@ jobs:
        - name: Build with CMake
          run: |
            cmake -S . -B build -G Ninja \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
+              -DLLAMA_FATAL_WARNINGS=ON \
              -DCMAKE_BUILD_TYPE=Release \
              -DCMAKE_CUDA_ARCHITECTURES=89-real \
              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-              -DLLAMA_FATAL_WARNINGS=ON \
              -DGGML_NATIVE=OFF \
              -DGGML_CUDA=ON
            cmake --build build
@@ -1099,25 +1078,20 @@ jobs:
        run: |
          choco install ninja

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
        shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DLLAMA_BUILD_SERVER=ON ^
+            -DLLAMA_CURL=OFF ^
+            -DLLAMA_BUILD_BORINGSSL=ON ^
            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_CPU_ALL_VARIANTS=ON ^
            -DGGML_CUDA=ON ^
-            -DGGML_RPC=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
+            -DGGML_RPC=ON
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release
@@ -1149,7 +1123,7 @@ jobs:
        run:  |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

-      # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args

      - name: Build
        id: cmake_build
@@ -1206,14 +1180,8 @@ jobs:
          key: ${{ github.job }}
          evict-old-files: 1d

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
@@ -1222,11 +1190,12 @@ jobs:
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
            -DCMAKE_BUILD_TYPE=Release `
+            -DLLAMA_CURL=OFF `
+            -DLLAMA_BUILD_BORINGSSL=ON `
            -DROCM_DIR="${env:HIP_PATH}" `
            -DGGML_HIP=ON `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_RPC=ON `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+            -DGGML_RPC=ON
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

  ios-xcode-build:
@@ -1305,6 +1274,81 @@ jobs:
          cd examples/llama.android
          ./gradlew build --no-daemon

+  android-ndk-build:
+    runs-on: ubuntu-latest
+
+    env:
+      OPENCL_VERSION: 2025.07.22
+
+    strategy:
+      matrix:
+        include:
+          - build: 'arm64-cpu'
+            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
+          - build: 'arm64-snapdragon'
+            defines: '--preset arm64-android-snapdragon-release'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'arm64-snapdragon' }}
+        run: |
+          mkdir opencl
+          curl -L -o opencl/clhpp.tar.gz      https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
+          curl -L -o opencl/headers.tar.gz    https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
+          curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
+          tar -xaf opencl/headers.tar.gz    -C opencl
+          tar -xaf opencl/clhpp.tar.gz      -C opencl
+          tar -xaf opencl/icd-loader.tar.gz -C opencl
+          sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL         ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+          sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
+          cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
+          cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
+          cmake --build build
+          sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+          rm -rf opencl
+
+      - name: Install Hexagon SDK
+        id: install_hexsdk
+        if: ${{ matrix.build == 'arm64-snapdragon' }}
+        env:
+          HEXSDK_VER: 6.4.0.2
+          HEXTLS_VER: 19.0.04
+        run: |
+          curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
+          mkdir hex-sdk
+          tar -xaf hex-sdk.tar.gz -C hex-sdk
+          ls -l hex-sdk
+          sudo mv hex-sdk /opt/hexagon
+          echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER"                                     >> "$GITHUB_ENV"
+          echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER"   >> "$GITHUB_ENV"
+          echo "DEFAULT_HLOS_ARCH=64"                                                          >> "$GITHUB_ENV"
+          echo "DEFAULT_TOOLS_VARIANT=toolv19"                                                 >> "$GITHUB_ENV"
+          echo "DEFAULT_NO_QURT_INC=0"                                                         >> "$GITHUB_ENV"
+          echo "DEFAULT_DSP_ARCH=v73"                                                          >> "$GITHUB_ENV"
+
+      - name: Update CMake presets
+        id: update_presets
+        if: ${{ matrix.build == 'arm64-snapdragon' }}
+        run: |
+          cp docs/backend/hexagon/CMakeUserPresets.json .
+
+      - name: Build
+        id: ndk_build
+        run: |
+          cmake ${{ matrix.defines }} -B build
+          cmake --build build
+          cmake --install build --prefix pkg-adb/llama.cpp
+
+      - name: Test
+        id: cmake_test
+        run: |
+          echo "FIXME: test on devices"
+
  openEuler-latest-cmake-cann:
    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
    defaults:
@@ -1313,14 +1357,10 @@ jobs:
    strategy:
      matrix:
        arch: [x86, aarch64]
-        cann:
-          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
-        device:
-          - 'ascend910b3'
-        build:
-          - 'Release'
+        chip_type: ['910b', '310p']
+        build: ['Release']
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.cann }}
+    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -1337,7 +1377,7 @@ jobs:
          cmake -S . -B build \
              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
              -DGGML_CANN=on \
-              -DSOC_TYPE=${{ matrix.device }}
+              -DSOC_TYPE=ascend${{ matrix.chip_type }}
          cmake --build build -j $(nproc)

 # TODO: simplify the following workflows using a matrix
@@ -1522,6 +1562,34 @@ jobs:
        run: |
          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

+  ggml-ci-x64-amd-vulkan:
+    runs-on: [self-hosted, Linux, X64, AMD]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+  ggml-ci-x64-amd-rocm:
+    runs-on: [self-hosted, Linux, X64, AMD]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          amd-smi static
+          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]

@@ -1574,3 +1642,50 @@ jobs:
         run: |
           GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

+  ggml-ci-arm64-graviton4-kleidiai:
+     runs-on: ah-ubuntu_22_04-c8g_8x
+
+     steps:
+       - name: Clone
+         id: checkout
+         uses: actions/checkout@v4
+
+       - name: Dependencies
+         id: depends
+         run: |
+           set -euxo pipefail
+           sudo apt-get update
+           sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+           apt-get install -y \
+            build-essential \
+            libcurl4-openssl-dev \
+            python3-venv \
+            gpg \
+            wget \
+            time \
+            git-lfs
+
+           git lfs install
+
+           # install the latest cmake
+           sudo install -d /usr/share/keyrings
+           wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+            | gpg --dearmor \
+            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+           echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+            | sudo tee /etc/apt/sources.list.d/kitware.list
+           sudo apt-get update
+           sudo apt-get install -y cmake
+
+       - name: ccache
+         uses: ggml-org/ccache-action@v1.2.16
+         with:
+           key: ggml-ci-arm64-graviton4-kleidiai
+           evict-old-files: 1d
+
+       - name: Test
+         id: ggml-ci
+         run: |
+           GG_BUILD_KLEIDIAI=1 \
+           GG_BUILD_EXTRA_TESTS_0=1 \
+           bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -0,0 +1,52 @@
+name: Check vendor
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      'vendor/**',
+      'scripts/sync_vendor.py'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      'vendor/**',
+      'scripts/sync_vendor.py'
+    ]
+
+jobs:
+  check-vendor:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      - name: Run vendor sync
+        run: |
+          set -euo pipefail
+          python3 scripts/sync_vendor.py
+
+      - name: Check for changes
+        run: |
+          set -euo pipefail
+          # detect modified or untracked files
+          changed=$(git status --porcelain --untracked-files=all || true)
+          if [ -n "$changed" ]; then
+            echo "Vendor sync modified files:"
+            echo "$changed" | awk '{ print $2 }' | sed '/^$/d'
+            echo "Failing because vendor files mismatch. Please update scripts/sync_vendor.py"
+            exit 1
+          else
+            echo "Vendor files are up-to-date."
+          fi
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -40,7 +40,7 @@ jobs:
          # https://github.com/ggml-org/llama.cpp/issues/11888
          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "cpu",    dockerfile: ".devops/cpu.Dockerfile",    platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-          - { tag: "cuda",   dockerfile: ".devops/cuda.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "cuda",   dockerfile: ".devops/cuda.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
          - { tag: "musa",   dockerfile: ".devops/musa.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
          - { tag: "intel",  dockerfile: ".devops/intel.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -134,6 +134,8 @@ jobs:
        include:
          - build: 'x64'
            os: ubuntu-22.04
+          - build: 's390x'
+            os: ubuntu-24.04-s390x
          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
          # - build: 'arm64'
          #   os: ubuntu-22.04-arm
@@ -691,6 +693,51 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
          name: llama-${{ steps.tag.outputs.name }}-xcframework

+  openEuler-cann:
+    strategy:
+      matrix:
+        arch: [x86, aarch64]
+        chip_type: ['910b', '310p']
+        build: ['Release']
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Dependencies
+        run: |
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake libcurl-devel
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Build
+        run: |
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+
+          cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DGGML_CANN=on \
+              -DSOC_TYPE=ascend${{ matrix.chip_type }}
+          cmake --build build -j $(nproc)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -712,6 +759,7 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
+      - openEuler-cann

    steps:
      - name: Clone
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -56,7 +56,7 @@ jobs:
            curl \
            wget \
            language-pack-en \
-            libcurl4-openssl-dev
+            libssl-dev

      - name: Clone
        id: checkout
@@ -209,7 +209,7 @@ jobs:
        working-directory: tools/server/webui

      - name: Run UI tests
-        run: npm run test:ui
+        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/server/webui

      - name: Run E2E tests
@@ -242,7 +242,7 @@ jobs:
            curl \
            wget \
            language-pack-en \
-            libcurl4-openssl-dev
+            libssl-dev

      - name: Clone
        id: checkout
@@ -283,6 +283,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@@ -295,6 +297,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@@ -306,6 +310,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@@ -345,16 +351,10 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+          cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
@@ -368,13 +368,6 @@ jobs:
        run: |
          pip install -r tools/server/tests/requirements.txt

-      - name: Copy Libcurl
-        id: prepare_libcurl
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
-
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -3,10 +3,12 @@ name: Update Operations Documentation
 on:
    push:
        paths:
+            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'
    pull_request:
        paths:
+            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'

--- a/.gitignore
+++ b/.gitignore
@@ -20,52 +20,40 @@
 *.so
 *.swp
 *.tmp
+*.DS_Store

 # IDE / OS

-.cache/
-.ccls-cache/
-.direnv/
-.DS_Store
-.envrc
-.idea/
-.swiftpm
-.vs/
-.vscode/
-nppBackup
+/.cache/
+/.ccls-cache/
+/.direnv/
+/.envrc
+/.idea/
+/.swiftpm
+/.vs/
+/.vscode/
+/nppBackup


 # Coverage

-gcovr-report/
-lcov-report/
+/gcovr-report/
+/lcov-report/

 # Build Artifacts

-tags
-.build/
-build*
-release
-debug
-!build-info.cmake
-!build-info.cpp.in
-!build-info.sh
-!build.zig
-!docs/build.md
+/tags
+/.build/
+/build*
+/release
+/debug
 /libllama.so
 /llama-*
 /vulkan-shaders-gen
-android-ndk-*
-arm_neon.h
-cmake-build-*
-CMakeSettings.json
-compile_commands.json
-ggml-metal-embed.metal
-llama-batched-swift
 /rpc-server
-out/
-tmp/
-autogen-*.md
+/out/
+/tmp/
+/autogen-*.md

 # Deprecated

@@ -74,44 +62,38 @@ autogen-*.md

 # CI

-!.github/workflows/*.yml
+!/.github/workflows/*.yml

 # Models

-models/*
-models-mnt
-!models/.editorconfig
-!models/ggml-vocab-*.gguf*
-!models/templates
+/models/*
+/models-mnt
+!/models/.editorconfig
+!/models/ggml-vocab-*.gguf*
+!/models/templates

 # Zig
-zig-out/
-zig-cache/
-
-# Logs
-
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
+/zig-out/
+/zig-cache/

 # Examples

-examples/jeopardy/results.txt
-tools/server/*.css.hpp
-tools/server/*.html.hpp
-tools/server/*.js.hpp
-tools/server/*.mjs.hpp
-tools/server/*.gz.hpp
-!build_64.sh
-!examples/*.bat
-!examples/*/*.kts
-!examples/*/*/*.kts
-!examples/sycl/*.bat
-!examples/sycl/*.sh
+/examples/jeopardy/results.txt
+/tools/server/*.css.hpp
+/tools/server/*.html.hpp
+/tools/server/*.js.hpp
+/tools/server/*.mjs.hpp
+/tools/server/*.gz.hpp
+!/build_64.sh
+!/examples/*.bat
+!/examples/*/*.kts
+!/examples/*/*/*.kts
+!/examples/sycl/*.bat
+!/examples/sycl/*.sh

 # Server Web UI temporary files
-node_modules
-tools/server/webui/dist
+/tools/server/webui/node_modules
+/tools/server/webui/dist

 # Python

@@ -147,8 +129,8 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
-.ccache/
+/.ccache/

 # IDE
-*.code-workspace
-.windsurf/
+/*.code-workspace
+/.windsurf/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,6 +92,7 @@ option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_

 # 3rd party libs
 option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_HTTPLIB    "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
 option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" OFF)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

@@ -200,6 +201,9 @@ endif()

 if (LLAMA_BUILD_COMMON)
    add_subdirectory(common)
+    if (LLAMA_HTTPLIB)
+        add_subdirectory(vendor/cpp-httplib)
+    endif()
 endif()

 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
--- a/35
+++ b/35
@@ -2,10 +2,8 @@
 # multiple collaborators per item can be specified

 /.devops/*.Dockerfile                   @ngxson
-/.github/actions/                       @slaren @CISC
+/.github/actions/                       @CISC
 /.github/workflows/                     @CISC
-/.github/workflows/release.yml          @slaren
-/.github/workflows/winget.yml           @slaren
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/CMakeLists.txt                  @ggerganov
@@ -40,40 +38,34 @@
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
 /examples/save-load-state/              @ggerganov
-/examples/simple-chat/                  @slaren
-/examples/simple/                       @slaren
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
-/ggml/include/                          @ggerganov @slaren
-/ggml/src/ggml-alloc.c                  @slaren
-/ggml/src/ggml-backend*                 @slaren
-/ggml/src/ggml-blas/                    @slaren
-/ggml/src/ggml-common.h                 @ggerganov @slaren
-/ggml/src/ggml-cpu/                     @ggerganov @slaren
+/ggml/include/                          @ggerganov
+/ggml/src/ggml-common.h                 @ggerganov
+/ggml/src/ggml-cpu/                     @ggerganov
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
-/ggml/src/ggml-cuda/common.cuh          @slaren
 /ggml/src/ggml-cuda/fattn*              @JohannesGaessler
-/ggml/src/ggml-cuda/ggml-cuda.cu        @slaren
-/ggml/src/ggml-cuda/mmf.*               @JohannesGaessler
+/ggml/src/ggml-cuda/mmf.*               @JohannesGaessler @am17an
 /ggml/src/ggml-cuda/mmq.*               @JohannesGaessler
 /ggml/src/ggml-cuda/mmvf.*              @JohannesGaessler
 /ggml/src/ggml-cuda/mmvq.*              @JohannesGaessler
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
 /ggml/src/ggml-hip/                     @IMbackK
 /ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
-/ggml/src/ggml-impl.h                   @ggerganov @slaren
+/ggml/src/ggml-impl.h                   @ggerganov
 /ggml/src/ggml-metal/                   @ggerganov
 /ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
+/ggml/src/ggml-hexagon/                 @max-krasnyansky @lhez
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @rgerganov
-/ggml/src/ggml-threading.*              @ggerganov @slaren
+/ggml/src/ggml-threading.*              @ggerganov
 /ggml/src/ggml-vulkan/                  @0cc4m
 /ggml/src/ggml-webgpu/                  @reeselevine
 /ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml.c                        @ggerganov @slaren
-/ggml/src/ggml.cpp                      @ggerganov @slaren
+/ggml/src/ggml.c                        @ggerganov
+/ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
 /gguf-py/                               @CISC
 /media/                                 @ggerganov
@@ -85,14 +77,11 @@
 /src/llama-arch.*                       @CISC
 /src/llama-chat.*                       @ngxson
 /src/llama-graph.*                      @CISC
-/src/llama-model-loader.*               @slaren
 /src/llama-model.*                      @CISC
 /src/llama-vocab.*                      @CISC
+/src/models/                            @CISC
 /tests/                                 @ggerganov
-/tests/test-backend-ops.cpp             @slaren
-/tests/test-thread-safety.cpp           @slaren
 /tools/batched-bench/                   @ggerganov
-/tools/llama-bench/                     @slaren
 /tools/main/                            @ggerganov
 /tools/mtmd/                            @ngxson
 /tools/perplexity/                      @ggerganov
@@ -104,8 +93,6 @@
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
-/.clang-format                          @slaren
-/.clang-tidy                            @slaren
 /AUTHORS                                @ggerganov
 /CMakeLists.txt                         @ggerganov
 /CONTRIBUTING.md                        @ggerganov
--- a/README.md
+++ b/README.md
@@ -17,14 +17,13 @@ LLM inference in C/C++

 ## Hot topics

- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
+- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
+- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
+- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
 - Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

@@ -62,6 +61,7 @@ range of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
+- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
@@ -84,6 +84,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [x] [Jamba](https://huggingface.co/ai21labs)
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
@@ -138,6 +139,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
 - [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
+- [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)

 #### Multimodal

@@ -187,6 +189,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
+- Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma)

 </details>

@@ -239,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
+- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)

 </details>

@@ -278,6 +282,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
+| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |

 ## Obtaining and quantizing models

--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json
@@ -0,0 +1,6 @@
+{
+  "chars": 2296.1916666666666,
+  "chars:std": 986.051306946325,
+  "score": 0.925,
+  "score:std": 0.26339134382131846
+}
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json
--- a/benches/dgx-spark/dgx-spark.md
+++ b/benches/dgx-spark/dgx-spark.md
@@ -0,0 +1,264 @@
+## System info
+
+```bash
+uname --all
+Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
+
+g++ --version
+g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
+
+nvidia-smi
+Sun Nov  2 10:43:25 2025
+-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
+| N/A   35C    P8              4W /  N/A  | Not Supported          |      0%      Default |
+|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
+```
+
+## ggml-org/gpt-oss-20b-GGUF
+
+Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.374 |  1369.01 |    0.383 |    83.64 |    0.757 |   719.01 |
+|   512 |     32 |    2 |   1088 |    0.274 |  3741.35 |    0.659 |    97.14 |    0.933 |  1166.66 |
+|   512 |     32 |    4 |   2176 |    0.526 |  3896.47 |    0.817 |   156.73 |    1.342 |  1621.08 |
+|   512 |     32 |    8 |   4352 |    1.044 |  3925.10 |    0.987 |   259.44 |    2.030 |  2143.56 |
+|   512 |     32 |   16 |   8704 |    2.076 |  3945.84 |    1.248 |   410.32 |    3.324 |  2618.60 |
+|   512 |     32 |   32 |  17408 |    4.170 |  3929.28 |    1.630 |   628.40 |    5.799 |  3001.76 |
+|  4096 |     32 |    1 |   4128 |    1.083 |  3782.66 |    0.394 |    81.21 |    1.477 |  2795.13 |
+|  4096 |     32 |    2 |   8256 |    2.166 |  3782.72 |    0.725 |    88.28 |    2.891 |  2856.14 |
+|  4096 |     32 |    4 |  16512 |    4.333 |  3780.88 |    0.896 |   142.82 |    5.230 |  3157.38 |
+|  4096 |     32 |    8 |  33024 |    8.618 |  3802.14 |    1.155 |   221.69 |    9.773 |  3379.08 |
+|  4096 |     32 |   16 |  66048 |   17.330 |  3781.73 |    1.598 |   320.34 |   18.928 |  3489.45 |
+|  4096 |     32 |   32 | 132096 |   34.671 |  3780.48 |    2.336 |   438.35 |   37.007 |  3569.51 |
+|  8192 |     32 |    1 |   8224 |    2.233 |  3668.56 |    0.438 |    72.98 |    2.671 |  3078.44 |
+|  8192 |     32 |    2 |  16448 |    4.425 |  3702.95 |    0.756 |    84.66 |    5.181 |  3174.95 |
+|  8192 |     32 |    4 |  32896 |    8.859 |  3698.64 |    0.967 |   132.38 |    9.826 |  3347.72 |
+|  8192 |     32 |    8 |  65792 |   17.714 |  3699.57 |    1.277 |   200.52 |   18.991 |  3464.35 |
+|  8192 |     32 |   16 | 131584 |   35.494 |  3692.84 |    1.841 |   278.12 |   37.335 |  3524.46 |
+|  8192 |     32 |   32 | 263168 |   70.949 |  3694.82 |    2.798 |   365.99 |   73.747 |  3568.53 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |      3714.25 ± 20.36 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         86.58 ± 0.43 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |      3445.17 ± 17.85 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         81.72 ± 0.53 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |      3218.78 ± 11.34 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         74.86 ± 0.64 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       2732.83 ± 7.17 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         71.57 ± 0.51 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |      2119.75 ± 12.81 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         62.33 ± 0.24 |
+
+build: eeee367de (6989)
+
+## ggml-org/gpt-oss-120b-GGUF
+
+Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.571 |   897.18 |    0.543 |    58.96 |    1.113 |   488.60 |
+|   512 |     32 |    2 |   1088 |    0.593 |  1725.37 |    1.041 |    61.45 |    1.635 |   665.48 |
+|   512 |     32 |    4 |   2176 |    1.043 |  1963.15 |    1.334 |    95.95 |    2.377 |   915.36 |
+|   512 |     32 |    8 |   4352 |    2.099 |  1951.63 |    1.717 |   149.07 |    3.816 |  1140.45 |
+|   512 |     32 |   16 |   8704 |    4.207 |  1947.12 |    2.311 |   221.56 |    6.518 |  1335.35 |
+|   512 |     32 |   32 |  17408 |    8.422 |  1945.36 |    3.298 |   310.46 |   11.720 |  1485.27 |
+|  4096 |     32 |    1 |   4128 |    2.138 |  1915.88 |    0.571 |    56.09 |    2.708 |  1524.12 |
+|  4096 |     32 |    2 |   8256 |    4.266 |  1920.25 |    1.137 |    56.27 |    5.404 |  1527.90 |
+|  4096 |     32 |    4 |  16512 |    8.564 |  1913.02 |    1.471 |    86.99 |   10.036 |  1645.29 |
+|  4096 |     32 |    8 |  33024 |   17.092 |  1917.19 |    1.979 |   129.33 |   19.071 |  1731.63 |
+|  4096 |     32 |   16 |  66048 |   34.211 |  1915.65 |    2.850 |   179.66 |   37.061 |  1782.15 |
+|  4096 |     32 |   32 | 132096 |   68.394 |  1916.44 |    4.381 |   233.72 |   72.775 |  1815.13 |
+|  8192 |     32 |    1 |   8224 |    4.349 |  1883.45 |    0.620 |    51.65 |    4.969 |  1655.04 |
+|  8192 |     32 |    2 |  16448 |    8.674 |  1888.83 |    1.178 |    54.33 |    9.852 |  1669.48 |
+|  8192 |     32 |    4 |  32896 |   17.351 |  1888.55 |    1.580 |    81.01 |   18.931 |  1737.68 |
+|  8192 |     32 |    8 |  65792 |   34.743 |  1886.31 |    2.173 |   117.80 |   36.916 |  1782.20 |
+|  8192 |     32 |   16 | 131584 |   69.413 |  1888.29 |    3.297 |   155.28 |   72.710 |  1809.70 |
+|  8192 |     32 |   32 | 263168 |  138.903 |  1887.24 |    5.004 |   204.63 |  143.907 |  1828.73 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |       1919.36 ± 5.01 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         60.40 ± 0.30 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       1825.30 ± 6.37 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         56.94 ± 0.29 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |       1739.19 ± 6.00 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         52.51 ± 0.42 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       1536.75 ± 4.27 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         49.33 ± 0.27 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |       1255.85 ± 3.26 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         42.99 ± 0.18 |
+
+build: eeee367de (6989)
+
+## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
+
+Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.398 |  1285.90 |    0.530 |    60.41 |    0.928 |   586.27 |
+|   512 |     32 |    2 |   1088 |    0.386 |  2651.65 |    0.948 |    67.50 |    1.334 |   815.38 |
+|   512 |     32 |    4 |   2176 |    0.666 |  3076.37 |    1.209 |   105.87 |    1.875 |  1160.71 |
+|   512 |     32 |    8 |   4352 |    1.325 |  3091.39 |    1.610 |   158.98 |    2.935 |  1482.65 |
+|   512 |     32 |   16 |   8704 |    2.664 |  3075.58 |    2.150 |   238.19 |    4.813 |  1808.39 |
+|   512 |     32 |   32 |  17408 |    5.336 |  3070.31 |    2.904 |   352.59 |    8.240 |  2112.50 |
+|  4096 |     32 |    1 |   4128 |    1.444 |  2836.81 |    0.581 |    55.09 |    2.025 |  2038.81 |
+|  4096 |     32 |    2 |   8256 |    2.872 |  2852.14 |    1.084 |    59.06 |    3.956 |  2086.99 |
+|  4096 |     32 |    4 |  16512 |    5.744 |  2852.32 |    1.440 |    88.90 |    7.184 |  2298.47 |
+|  4096 |     32 |    8 |  33024 |   11.463 |  2858.68 |    2.068 |   123.78 |   13.531 |  2440.65 |
+|  4096 |     32 |   16 |  66048 |   22.915 |  2859.95 |    3.018 |   169.67 |   25.933 |  2546.90 |
+|  4096 |     32 |   32 | 132096 |   45.956 |  2852.10 |    4.609 |   222.18 |   50.565 |  2612.39 |
+|  8192 |     32 |    1 |   8224 |    3.063 |  2674.72 |    0.693 |    46.20 |    3.755 |  2189.92 |
+|  8192 |     32 |    2 |  16448 |    6.109 |  2681.87 |    1.214 |    52.71 |    7.323 |  2245.98 |
+|  8192 |     32 |    4 |  32896 |   12.197 |  2686.63 |    1.682 |    76.11 |   13.878 |  2370.30 |
+|  8192 |     32 |    8 |  65792 |   24.409 |  2684.94 |    2.556 |   100.17 |   26.965 |  2439.95 |
+|  8192 |     32 |   16 | 131584 |   48.753 |  2688.50 |    3.994 |   128.20 |   52.747 |  2494.64 |
+|  8192 |     32 |   32 | 263168 |   97.508 |  2688.42 |    6.528 |   156.86 |  104.037 |  2529.57 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |       2925.55 ± 4.25 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         62.80 ± 0.27 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       2531.01 ± 6.79 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         55.86 ± 0.33 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |       2244.39 ± 5.33 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         45.95 ± 0.33 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       1783.17 ± 3.68 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         39.07 ± 0.10 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |       1241.90 ± 3.13 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         29.92 ± 0.06 |
+
+build: eeee367de (6989)
+
+## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
+
+Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.211 |  2421.57 |    1.055 |    30.33 |    1.266 |   429.57 |
+|   512 |     32 |    2 |   1088 |    0.419 |  2441.34 |    1.130 |    56.65 |    1.549 |   702.32 |
+|   512 |     32 |    4 |   2176 |    0.873 |  2345.54 |    1.174 |   108.99 |    2.048 |  1062.74 |
+|   512 |     32 |    8 |   4352 |    1.727 |  2371.85 |    1.254 |   204.22 |    2.980 |  1460.19 |
+|   512 |     32 |   16 |   8704 |    3.452 |  2373.22 |    1.492 |   343.16 |    4.944 |  1760.56 |
+|   512 |     32 |   32 |  17408 |    6.916 |  2368.93 |    1.675 |   611.51 |    8.591 |  2026.36 |
+|  4096 |     32 |    1 |   4128 |    1.799 |  2277.26 |    1.084 |    29.51 |    2.883 |  1431.91 |
+|  4096 |     32 |    2 |   8256 |    3.577 |  2290.01 |    1.196 |    53.50 |    4.774 |  1729.51 |
+|  4096 |     32 |    4 |  16512 |    7.172 |  2284.36 |    1.313 |    97.50 |    8.485 |  1946.00 |
+|  4096 |     32 |    8 |  33024 |   14.341 |  2284.96 |    1.520 |   168.46 |   15.860 |  2082.18 |
+|  4096 |     32 |   16 |  66048 |   28.675 |  2285.44 |    1.983 |   258.21 |   30.658 |  2154.33 |
+|  4096 |     32 |   32 | 132096 |   57.354 |  2285.32 |    2.640 |   387.87 |   59.994 |  2201.82 |
+|  8192 |     32 |    1 |   8224 |    3.701 |  2213.75 |    1.119 |    28.59 |    4.820 |  1706.34 |
+|  8192 |     32 |    2 |  16448 |    7.410 |  2211.19 |    1.272 |    50.31 |    8.682 |  1894.56 |
+|  8192 |     32 |    4 |  32896 |   14.802 |  2213.83 |    1.460 |    87.68 |   16.261 |  2022.96 |
+|  8192 |     32 |    8 |  65792 |   29.609 |  2213.35 |    1.781 |   143.74 |   31.390 |  2095.93 |
+|  8192 |     32 |   16 | 131584 |   59.229 |  2212.96 |    2.495 |   205.17 |   61.725 |  2131.79 |
+|  8192 |     32 |   32 | 263168 |  118.449 |  2213.15 |    3.714 |   275.75 |  122.162 |  2154.25 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |       2272.74 ± 4.68 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         30.66 ± 0.02 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       2107.80 ± 9.55 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         29.71 ± 0.05 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |       1937.80 ± 6.75 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         28.86 ± 0.04 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       1641.12 ± 1.78 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         27.24 ± 0.04 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |       1296.02 ± 2.67 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         23.78 ± 0.03 |
+
+build: eeee367de (6989)
+
+## ggml-org/gemma-3-4b-it-qat-GGUF
+
+Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.094 |  5434.73 |    0.394 |    81.21 |    0.488 |  1114.15 |
+|   512 |     32 |    2 |   1088 |    0.168 |  6091.68 |    0.498 |   128.52 |    0.666 |  1633.41 |
+|   512 |     32 |    4 |   2176 |    0.341 |  6010.68 |    0.542 |   236.37 |    0.882 |  2466.43 |
+|   512 |     32 |    8 |   4352 |    0.665 |  6161.46 |    0.678 |   377.74 |    1.342 |  3241.72 |
+|   512 |     32 |   16 |   8704 |    1.323 |  6193.19 |    0.902 |   567.41 |    2.225 |  3911.74 |
+|   512 |     32 |   32 |  17408 |    2.642 |  6202.03 |    1.231 |   832.03 |    3.872 |  4495.36 |
+|  4096 |     32 |    1 |   4128 |    0.701 |  5840.49 |    0.439 |    72.95 |    1.140 |  3621.23 |
+|  4096 |     32 |    2 |   8256 |    1.387 |  5906.82 |    0.574 |   111.48 |    1.961 |  4210.12 |
+|  4096 |     32 |    4 |  16512 |    2.758 |  5940.33 |    0.651 |   196.58 |    3.409 |  4843.33 |
+|  4096 |     32 |    8 |  33024 |    5.491 |  5967.56 |    0.876 |   292.40 |    6.367 |  5187.12 |
+|  4096 |     32 |   16 |  66048 |   10.978 |  5969.58 |    1.275 |   401.69 |   12.253 |  5390.38 |
+|  4096 |     32 |   32 | 132096 |   21.944 |  5972.93 |    1.992 |   514.16 |   23.936 |  5518.73 |
+|  8192 |     32 |    1 |   8224 |    1.402 |  5841.91 |    0.452 |    70.73 |    1.855 |  4434.12 |
+|  8192 |     32 |    2 |  16448 |    2.793 |  5865.34 |    0.637 |   100.55 |    3.430 |  4795.51 |
+|  8192 |     32 |    4 |  32896 |    5.564 |  5889.64 |    0.770 |   166.26 |    6.334 |  5193.95 |
+|  8192 |     32 |    8 |  65792 |   11.114 |  5896.44 |    1.122 |   228.07 |   12.237 |  5376.51 |
+|  8192 |     32 |   16 | 131584 |   22.210 |  5901.38 |    1.789 |   286.15 |   24.000 |  5482.74 |
+|  8192 |     32 |   32 | 263168 |   44.382 |  5906.56 |    3.044 |   336.38 |   47.426 |  5549.02 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |      5810.04 ± 21.71 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         84.54 ± 0.18 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       5288.04 ± 3.54 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         78.82 ± 1.37 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |      4960.43 ± 16.64 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         74.13 ± 0.30 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |      4495.92 ± 31.11 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         72.37 ± 0.29 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |      3746.90 ± 40.01 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         63.02 ± 0.20 |
+
+build: eeee367de (6989)
+
--- a/benches/dgx-spark/run-aime-120b-t8-x8-high.log
+++ b/benches/dgx-spark/run-aime-120b-t8-x8-high.log
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -454,6 +454,8 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
+    -DLLAMA_HTTPLIB=OFF \
+    -DLLAMA_BUILD_SERVER=OFF \
    -S .
 cmake --build build-visionos --config Release -- -quiet

@@ -468,6 +470,8 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
+    -DLLAMA_HTTPLIB=OFF \
+    -DLLAMA_BUILD_SERVER=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -- -quiet

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -75,7 +75,7 @@ if [ ! -z ${GG_BUILD_ROCM} ]; then
        exit 1
    fi

-    CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -121,7 +121,12 @@ fi
 if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
    echo ">>===== Enabling KleidiAI support"

-    CANDIDATES=("armv9-a+dotprod+i8mm" "armv8.6-a+dotprod+i8mm" "armv8.2-a+dotprod")
+    CANDIDATES=(
+        "armv9-a+dotprod+i8mm+sve2"
+        "armv9-a+dotprod+i8mm"
+        "armv8.6-a+dotprod+i8mm"
+        "armv8.2-a+dotprod"
+    )
    CPU=""

    for cpu in "${CANDIDATES[@]}"; do
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -50,12 +50,16 @@ add_library(${TARGET} STATIC
    base64.hpp
    chat-parser.cpp
    chat-parser.h
+    chat-parser-xml-toolcall.h
+    chat-parser-xml-toolcall.cpp
    chat.cpp
    chat.h
    common.cpp
    common.h
    console.cpp
    console.h
+    download.cpp
+    download.h
    http.h
    json-partial.cpp
    json-partial.h
@@ -77,10 +81,11 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

+# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
 set(LLAMA_COMMON_EXTRA_LIBS build_info)

-# Use curl to download model url
 if (LLAMA_CURL)
+    # Use curl to download model url
    find_package(CURL)
    if (NOT CURL_FOUND)
        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
@@ -88,42 +93,10 @@ if (LLAMA_CURL)
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-endif()
-
-if (LLAMA_OPENSSL)
-    find_package(OpenSSL)
-    if (OpenSSL_FOUND)
-        include(CheckCSourceCompiles)
-        set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
-        set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
-        check_c_source_compiles("
-        #include <openssl/opensslv.h>
-        #if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
-        #    if OPENSSL_VERSION_NUMBER < 0x1010107f
-        #        error bad version
-        #    endif
-        #else
-        #    if OPENSSL_VERSION_NUMBER < 0x30000000L
-        #        error bad version
-        #    endif
-        #endif
-        int main() { return 0; }
-        " OPENSSL_VERSION_SUPPORTED)
-        set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
-        if (OPENSSL_VERSION_SUPPORTED)
-            message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
-            target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
-            target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
-            if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-                target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
-                find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
-                find_library(SECURITY_FRAMEWORK Security REQUIRED)
-                target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
-            endif()
-        endif()
-    else()
-        message(STATUS "OpenSSL not found, SSL support disabled")
-    endif()
+elseif (LLAMA_HTTPLIB)
+    # otherwise, use cpp-httplib
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
 endif()

 if (LLAMA_LLGUIDANCE)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -59,8 +59,8 @@ struct common_arg {
    common_arg & set_sparam();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
+    bool get_value_from_env(std::string & output) const;
+    bool has_value_from_env() const;
    std::string to_string();
 };

--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@@ -0,0 +1,861 @@
+#include "chat.h"
+#include "chat-parser.h"
+#include "common.h"
+#include "json-partial.h"
+#include "json-schema-to-grammar.h"
+#include "log.h"
+#include "regex-partial.h"
+
+using json = nlohmann::ordered_json;
+
+class xml_toolcall_syntax_exception : public std::runtime_error { // runtime_error subclass that only forwards its message; presumably raised on malformed XML-style tool-call text (throw sites are not visible here)
+  public:
+    xml_toolcall_syntax_exception(const std::string & message) : std::runtime_error(message) {}
+};
+
+template<typename T>
+inline void sort_uniq(std::vector<T> &vec) { // sorts vec in ascending order and removes duplicate elements, in place
+    std::sort(vec.begin(), vec.end());
+    vec.erase(std::unique(vec.begin(), vec.end()), vec.end()); // std::unique only collapses adjacent equal elements, hence the sort first
+}
+
+template<typename T>
+inline bool all_space(const T &str) { // true when every element of str is whitespace per std::isspace (an empty str also yields true)
+    return std::all_of(str.begin(), str.end(), [](unsigned char ch) { return std::isspace(ch); }); // unsigned char parameter keeps bytes >= 0x80 from reaching std::isspace as negative values
+}
+
+static size_t utf8_truncate_safe(const std::string_view s) { // number of leading bytes of s to keep so that it does not end in a cut-off UTF-8 sequence
+    size_t len = s.size();
+    if (len == 0) return 0;
+    size_t i = len;
+    for (size_t back = 0; back < 4 && i > 0; ++back) { // inspect at most the last 4 bytes, walking backwards
+        --i;
+        unsigned char c = s[i];
+        if ((c & 0x80) == 0) { // ASCII byte: keep the whole input
+            return len;
+        } else if ((c & 0xC0) == 0xC0) { // lead byte (11xxxxxx); 10xxxxxx continuation bytes go on to the next iteration
+            size_t expected_len = 0;
+            if ((c & 0xE0) == 0xC0) expected_len = 2;
+            else if ((c & 0xF0) == 0xE0) expected_len = 3;
+            else if ((c & 0xF8) == 0xF0) expected_len = 4;
+            else return i; // 11111xxx is not a valid lead byte: cut from this byte on
+            if (len - i >= expected_len) { // the sequence starting at i has all of its bytes
+                return len;
+            } else {
+                return i; // sequence is incomplete: cut it off
+            }
+        }
+    }
+    return len - std::min(len, size_t(3)); // only continuation bytes were seen: drop up to 3 trailing bytes
+}
+
+inline void utf8_truncate_safe_resize(std::string &s) { // shrinks s in place to the length computed by utf8_truncate_safe
+    s.resize(utf8_truncate_safe(s));
+}
+
+inline std::string_view utf8_truncate_safe_view(const std::string_view s) { // same truncation as a non-owning view; the result refers to the same characters as s
+    return s.substr(0, utf8_truncate_safe(s));
+}
+
+static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) { // finds literal1, then (after consume_spaces) literal2 or as much of literal2 as the remaining input holds
+    if (literal1.size() == 0) return builder.try_find_literal(literal2); // no first literal: plain search for literal2
+    const auto saved_pos = builder.pos(); // restored below when nothing matches
+    while (auto res = builder.try_find_literal(literal1)) {
+        builder.consume_spaces(); // presumably skips whitespace after literal1 (helper defined elsewhere)
+        const auto match_len = std::min(literal2.size(), builder.input().size() - builder.pos()); // clamped to the remaining input, so a literal2 cut off by the end of input still matches
+        if (builder.input().compare(builder.pos(), match_len, literal2, 0, match_len) == 0) {
+            if (res->prelude.size() != res->groups[0].begin - saved_pos) { // presumably differs only after a retry: rebuild prelude from the original start position
+                res->prelude = builder.str({saved_pos, res->groups[0].begin});
+            }
+            builder.move_to(builder.pos() + match_len); // step over the matched part of literal2
+            res->groups[0].end = builder.pos(); // group 0 now runs from the start of literal1 to the end of the literal2 match
+            GGML_ASSERT(res->groups[0].begin != res->groups[0].end);
+            return res;
+        }
+        builder.move_to(res->groups[0].begin + 1); // literal2 did not follow: retry one position after this literal1 hit
+    }
+    builder.move_to(saved_pos); // no match: leave the parser where it started
+    return std::nullopt;
+}
+
+/**
+ * Make a GBNF expression that accepts any string except those containing any of the forbidden strings.
+ */
+std::string make_gbnf_excluding(std::vector<std::string> forbids) {
+    constexpr auto charclass_escape = [](unsigned char c) -> std::string { // renders one byte for use inside a [...] character class
+        if (c == '\\' || c == ']' || c == '^' || c == '-') { // characters with special meaning inside a character class
+            std::string s = "\\";
+            s.push_back((char)c);
+            return s;
+        }
+        if (isprint(c)) {
+            return std::string(1, (char)c);
+        }
+        char buf[16];
+        snprintf(buf, 15, "\\x%02X", c); // non-printable byte as \xHH
+        return std::string(buf);
+    };
+    constexpr auto build_expr = [charclass_escape](auto self, const std::vector<std::string>& forbids, int l, int r, int depth) -> std::string { // recursive (self is passed explicitly): expression for the sorted range forbids[l, r), looking at byte position depth
+        std::vector<std::pair<unsigned char, std::pair<int,int>>> children; // (byte at depth, [begin, end) sub-range of forbids having that byte)
+        int i = l;
+        while (i < r) {
+            const std::string &s = forbids[i];
+            if ((int)s.size() == depth) { // this string ends here and adds no branch; NOTE(review): a forbidden string that is a prefix of another is skipped while the longer one still produces a branch - confirm this is intended
+                ++i;
+                continue;
+            }
+            unsigned char c = (unsigned char)s[depth];
+            int j = i;
+            while (j < r && (int)forbids[j].size() > depth &&
+                   (unsigned char)forbids[j][depth] == c) {
+                ++j;
+            }
+            children.push_back({c, {i, j}}); // forbids[i, j) all have byte c at position depth
+            i = j;
+        }
+        std::vector<std::string> alts;
+        if (!children.empty()) {
+            std::string cls;
+            for (auto &ch : children) cls += charclass_escape(ch.first);
+            alts.push_back(std::string("[^") + cls + "]"); // any byte that does not continue one of the forbidden strings
+        }
+        for (auto &ch : children) {
+            std::string childExpr = self(self, forbids, ch.second.first, ch.second.second, depth+1);
+            if (!childExpr.empty()) { // empty means every string in this group ends right after this byte, so the byte gets no branch
+                std::string quoted_ch = "\""; // the byte as a quoted GBNF literal
+                if (ch.first == '\\') quoted_ch += "\\\\";
+                else if (ch.first == '"') quoted_ch += "\\\"";
+                else if (isprint(ch.first)) quoted_ch.push_back(ch.first);
+                else {
+                    char buf[16];
+                    snprintf(buf, 15, "\\x%02X", ch.first);
+                    quoted_ch += buf;
+                }
+                quoted_ch += "\"";
+                std::string branch = quoted_ch + std::string(" ") + childExpr;
+                alts.push_back(branch);
+            }
+        }
+        if (alts.empty()) return "";
+        std::ostringstream oss; // join the alternatives as "( a | b | ... )"
+        oss << "( ";
+        for (size_t k = 0; k < alts.size(); ++k) {
+            if (k) oss << " | ";
+            oss << alts[k];
+        }
+        oss << " )";
+        return oss.str();
+    };
+    if (forbids.empty()) return "( . )*"; // nothing forbidden: accept anything
+    sort(forbids.begin(), forbids.end()); // build_expr groups equal bytes by adjacency, so the input must be sorted
+    std::string expr = build_expr(build_expr, forbids, 0, forbids.size(), 0);
+    if (expr.empty()) { // NOTE(review): build_expr returns "" only when every forbidden string is empty, and then cls stays empty too, giving "[^]" - confirm
+        std::string cls;
+        for (auto &s : forbids) if (!s.empty()) cls += charclass_escape((unsigned char)s[0]);
+        expr = std::string("( [^") + cls + "] )";
+    }
+    if (forbids.size() == 1)
+        return expr + "*";
+    else
+        return std::string("( ") + expr + " )*";
+}
+
+/**
+ * Build the GBNF grammar (data.grammar) and a word trigger (data.grammar_triggers) for xml-style tool calls described by `form`.
+ * Nothing is generated unless `tools` is a non-empty array. form.scope_start and form.scope_end can be empty.
+ * Requires data.format for model-specific hacks (currently the Kimi-K2 function name pattern).
+ */
+void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
+    GGML_ASSERT(!form.tool_start.empty());
+    GGML_ASSERT(!form.tool_sep.empty());
+    GGML_ASSERT(!form.key_start.empty());
+    GGML_ASSERT(!form.val_end.empty());
+    GGML_ASSERT(!form.tool_end.empty());
+
+    std::string key_val_sep = form.key_val_sep;
+    if (form.key_val_sep2) {
+        key_val_sep += "\n";
+        key_val_sep += *form.key_val_sep2;
+    }
+    GGML_ASSERT(!key_val_sep.empty());
+
+    if (tools.is_array() && !tools.empty()) {
+        data.grammar = build_grammar([&](const common_grammar_builder &builder) {
+            auto string_arg_val = form.last_val_end ?
+                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
+                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
+
+            std::vector<std::string> tool_rules;
+            for (const auto & tool : tools) {
+                if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
+                    LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
+                    continue;
+                }
+                const auto & function = tool.at("function");
+                if (!function.contains("name") || !function.at("name").is_string()) {
+                    LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
+                    continue;
+                }
+                if (!function.contains("parameters") || !function.at("parameters").is_object()) {
+                    LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
+                    continue;
+                }
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                struct parameter_rule {
+                    std::string symbol_name;
+                    bool is_required;
+                };
+                std::vector<parameter_rule> arg_rules;
+                if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
+                    LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
+                    continue;
+                } else {
+                    std::vector<std::string> requiredParameters;
+                    if (parameters.contains("required")) {
+                        try { parameters.at("required").get_to(requiredParameters); }
+                        catch (const std::exception &) { // nlohmann::json exceptions derive from std::exception, not std::runtime_error
+                            LOG_WRN("Invalid function required parameters, ignoring: %s", parameters.at("required").dump(2).c_str());
+                        }
+                    }
+                    sort_uniq(requiredParameters);
+                    for (const auto & [key, value] : parameters.at("properties").items()) {
+                        std::string quoted_key = key;
+                        bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
+                        if (form.key_start.back() == '"' && key_val_sep[0] == '"') { // key is emitted between literal double quotes
+                            quoted_key = gbnf_format_literal(key);
+                            quoted_key = quoted_key.substr(1, quoted_key.size() - 2); // drop the first and last character of the formatted literal
+                        }
+                        arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
+                            gbnf_format_literal(form.key_start) + " " +
+                            gbnf_format_literal(quoted_key) + " " +
+                            gbnf_format_literal(key_val_sep) + " " +
+                            ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
+                                    (form.raw_argval ?
+                                            string_arg_val :
+                                            "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
+                                    ) :
+                                    builder.add_schema(name + "-arg-" + key, value)
+                            )
+                        ), required});
+                    }
+                }
+
+                auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
+                decltype(next_arg_with_sep) next_arg = "\"\"";
+                for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) { // reverse walk; unsigned wrap-around ends the loop
+                    std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
+                    next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
+                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg
+                    );
+                    include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
+                    next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
+                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
+                    );
+                }
+
+                std::string quoted_name = name;
+                if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
+                    quoted_name = gbnf_format_literal(name);
+                    quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
+                }
+                quoted_name = gbnf_format_literal(quoted_name);
+                // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
+                if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
+                    quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
+                }
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                        gbnf_format_literal(form.tool_start) + " " +
+                        quoted_name + " " +
+                        gbnf_format_literal(form.tool_sep) + " " +
+                        next_arg
+                ));
+            }
+
+            auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
+            auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
+            auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
+            auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
+            builder.add_rule("root",
+                (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
+                tool_call_multiple_with_end  + "?" +
+                (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
+            );
+        });
+
+        // grammar trigger for tool call
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
+    }
+}
+
+/**
+ * Parse XML-Style tool calls for the given xml_tool_call_format and add each parsed call to builder. Returns false for invalid syntax (rewinding to the start position where the code does so explicitly).
+ * Throws xml_toolcall_syntax_exception if there is invalid syntax and the original status of common_chat_msg_parser cannot be recovered; throws common_chat_msg_partial_exception when the input looks like an incomplete tool call.
+ * form.scope_start, form.tool_sep and form.scope_end can be empty; tool_start, key_start, key_val_sep, val_end and tool_end must not be (asserted below).
+ */
+inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
+    GGML_ASSERT(!form.tool_start.empty());
+    GGML_ASSERT(!form.key_start.empty());
+    GGML_ASSERT(!form.key_val_sep.empty());
+    GGML_ASSERT(!form.val_end.empty());
+    GGML_ASSERT(!form.tool_end.empty());
+
+    // Helper for syntax errors: when `recovery` is true, rewind to start_pos and return false; otherwise throw xml_toolcall_syntax_exception
+    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
+        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
+        if (recovery) {
+            builder.move_to(start_pos);
+            return false;
+        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
+    };
+    // Truncate json_str at the last occurrence of needle (and at a '"' directly before it); returns false if needle is absent or is followed by anything other than quotes, '}', ':' or whitespace
+    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
+        auto pos = json_str.rfind(needle);
+        if (pos == std::string::npos) {
+            return false;
+        }
+        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
+            unsigned char ch = static_cast<unsigned char>(json_str[i]);
+            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
+                return false;
+            }
+        }
+        if (pos != 0 && json_str[pos - 1] == '"') {
+            --pos;
+        }
+        json_str.resize(pos);
+        return true;
+    };
+    // Helper to emit a partial tool call: set_partial_arg stores the unparsed rest (tagged with the partial flag) into arguments, then the JSON dump is cut at that flag
+    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
+        auto rest = builder.consume_rest();
+        utf8_truncate_safe_resize(rest);
+        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
+        auto tool_str = arguments.dump();
+        if (partial_json(tool_str)) {
+            if (builder.add_tool_call(function_name, "", tool_str)) {
+                return;
+            }
+        }
+        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
+    };
+    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end): tries `end`, and `alt_end` followed by end_next / alt_end_next; a candidate with a strictly shorter prelude wins
+    constexpr auto try_find_close = [](
+            common_chat_msg_parser & builder,
+            const std::string & end,
+            const std::optional<std::string> & alt_end,
+            const std::string & end_next,
+            const std::optional<std::string> & alt_end_next
+    ) {
+        auto saved_pos = builder.pos();
+        auto tc = builder.try_find_literal(end);
+        auto val_end_size = end.size();
+        if (alt_end) {
+            auto pos_1 = builder.pos();
+            builder.move_to(saved_pos);
+            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
+            if (alt_end_next) {
+                builder.move_to(saved_pos);
+                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
+                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
+                    tc2 = tc3;
+                }
+            }
+            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
+                tc = tc2;
+                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size()); // keep only the alt_end part of the match
+                builder.move_to(tc->groups[0].end);
+                val_end_size = alt_end->size();
+            } else {
+                builder.move_to(pos_1);
+            }
+        }
+        return std::make_pair(val_end_size, tc);
+    };
+    // Helper to find a val_end or last_val_end, returns {size of the expected literal, match}
+    const auto try_find_val_end = [try_find_close, &builder, &form]() {
+        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
+    };
+    // Helper to find a tool_end or last_tool_end, returns {size of the expected literal, match}
+    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
+        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
+    };
+
+    bool recovery = true; // while true, return_error() rewinds and returns false instead of throwing
+    const auto start_pos = builder.pos();
+    if (!all_space(form.scope_start)) {
+        if (auto tc = builder.try_find_literal(form.scope_start)) {
+            if (all_space(tc->prelude)) {
+                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
+                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
+            } else {
+                builder.move_to(start_pos);
+                return false;
+            }
+        } else return false;
+    }
+    while (auto tc = builder.try_find_literal(form.tool_start)) {
+        if (!all_space(tc->prelude)) {
+            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
+                    gbnf_format_literal(form.tool_start).c_str(),
+                    gbnf_format_literal(tc->prelude).c_str()
+            );
+            builder.move_to(tc->groups[0].begin - tc->prelude.size());
+            break;
+        }
+
+        // Find tool name: it ends at tool_sep (or at key_start when tool_sep is blank), else at the tool end
+        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
+        if (!func_name) {
+            auto [sz, tc] = try_find_tool_end();
+            func_name = tc;
+        }
+        if (!func_name) {
+            // Partial tool name not supported
+            throw common_chat_msg_partial_exception("incomplete tool_call");
+        }
+        // If the model generated multiple tool calls and the first tool call has no arguments
+        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
+            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
+            auto [sz, tc] = try_find_tool_end();
+            func_name = tc; // NOTE(review): tc may be empty here and func_name is dereferenced below without a check — confirm this cannot happen
+        }
+
+        // Parse tool name
+        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
+        std::string function_name = string_strip(func_name->prelude);
+        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
+        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
+            if (string_starts_with(function_name, "functions.")) {
+                static const std::regex re(":\\d+$");
+                if (std::regex_search(function_name, re)) {
+                    function_name = function_name.substr(10, function_name.rfind(":") - 10); // 10 == strlen("functions.")
+                }
+            }
+        }
+
+        // Argument JSON
+        json arguments = json::object();
+
+        // Helper to generate a partial argument JSON for the current tool call
+        const auto gen_partial_args = [&](auto set_partial_arg) {
+            gen_partial_json(set_partial_arg, arguments, builder, function_name);
+        };
+
+        // Parse all arg_key/arg_value pairs
+        while (auto tc = builder.try_find_literal(form.key_start)) {
+            if (!all_space(tc->prelude)) {
+                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
+                        gbnf_format_literal(form.key_start).c_str(),
+                        gbnf_format_literal(tc->prelude).c_str()
+                );
+                builder.move_to(tc->groups[0].begin - tc->prelude.size());
+                break;
+            }
+            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
+                auto tool_call_arg = arguments.dump();
+                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
+                    tool_call_arg.resize(tool_call_arg.size() - 1);
+                }
+                builder.add_tool_call(function_name, "", tool_call_arg);
+                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
+            }
+
+            // Parse arg_key
+            auto key_res = builder.try_find_literal(form.key_val_sep);
+            if (!key_res) {
+                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
+                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
+            }
+            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
+                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
+                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
+            }
+            auto &key = key_res->prelude;
+            recovery = false; // from here on return_error() throws instead of returning false
+
+            // Parse arg_value
+            if (form.key_val_sep2) {
+                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
+                    if (!all_space(tc->prelude)) {
+                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
+                                gbnf_format_literal(tc->prelude).c_str(),
+                                gbnf_format_literal(form.key_val_sep).c_str(),
+                                gbnf_format_literal(*form.key_val_sep2).c_str()
+                        );
+                        return return_error(builder, start_pos, false);
+                    }
+                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
+                    }
+                } else {
+                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
+                }
+            }
+            auto val_start = builder.pos();
+
+            // Test if arg_val is a (possibly partial) JSON; skipped when form.raw_argval is set to true
+            std::optional<common_json> value_json = std::nullopt;
+            if (!form.raw_argval || !*form.raw_argval) {
+                try { value_json = builder.try_consume_json(); }
+                catch (const std::runtime_error&) { builder.move_to(val_start); }
+                // TODO: Delete this when json_partial adds top-level support for null/true/false
+                if (builder.pos() == val_start) {
+                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
+                    builder.consume_spaces();
+                    std::string_view sv = utf8_truncate_safe_view(builder.input());
+                    sv.remove_prefix(builder.pos());
+                    std::string rest = "a";
+                    if (sv.size() < 6) rest = sv;
+                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
+                        value_json = {123, {"123", "123"}}; // placeholder value: the remaining input may still grow into null/true/false/a number
+                        builder.consume_rest();
+                    } else {
+                        builder.move_to(val_start);
+                    }
+                }
+            }
+
+            // If it is a JSON and followed by </arg_value>, parse as json
+            // cannot support streaming because it may be a plain text starting with JSON
+            if (value_json) {
+                auto json_end = builder.pos();
+                builder.consume_spaces();
+                if (builder.pos() == builder.input().size()) {
+                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
+                        arguments[key] = value_json->json;
+                        auto json_str = arguments.dump();
+                        if (!value_json->healing_marker.json_dump_marker.empty()) {
+                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
+                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
+                        } else {
+                            GGML_ASSERT(json_str.back() == '}');
+                            json_str.resize(json_str.size() - 1);
+                        }
+                        builder.add_tool_call(function_name, "", json_str);
+                    } else {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                    }
+                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
+                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
+                }
+                builder.move_to(json_end);
+                auto [val_end_size, tc] = try_find_val_end();
+                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
+                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
+                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
+                    } else arguments[key] = value_json->json;
+                } else builder.move_to(val_start); // not a clean JSON value: rewind so the plain-text branch below handles it
+            }
+
+            // If not, parse as plain text (the position is still at val_start in that case)
+            if (val_start == builder.pos()) {
+                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
+                    auto &value_str = value_plain->prelude;
+                    if (form.trim_raw_argval) value_str = string_strip(value_str);
+                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
+                        throw common_chat_msg_partial_exception(
+                                "Expected " + gbnf_format_literal(form.val_end) +
+                                " after " + gbnf_format_literal(form.key_val_sep) +
+                                (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
+                        );
+                    }
+                    arguments[key] = value_str;
+                } else {
+                    if (form.trim_raw_argval) {
+                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
+                    } else {
+                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
+                    }
+                    throw common_chat_msg_partial_exception(
+                            "Expected " + gbnf_format_literal(form.val_end) +
+                            " after " + gbnf_format_literal(form.key_val_sep) +
+                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
+                    );
+                }
+            }
+        }
+
+        // Consume closing tag (tool_end or last_tool_end)
+        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
+            if (!all_space(tc->prelude)) {
+                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                        gbnf_format_literal(form.tool_end).c_str(),
+                        gbnf_format_literal(tc->prelude).c_str()
+                );
+                return return_error(builder, start_pos, recovery);
+            }
+            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
+                // Add the parsed tool call
+                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
+                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
+                }
+                recovery = false;
+                continue;
+            }
+        }
+
+        auto tool_call_arg = arguments.dump(); // closing tag missing or incomplete: emit the arguments without the final '}' as a partial call
+        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
+            tool_call_arg.resize(tool_call_arg.size() - 1);
+        }
+        builder.add_tool_call(function_name, "", tool_call_arg);
+        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
+    }
+    if (auto tc = builder.try_find_literal(form.scope_end)) {
+        if (!all_space(tc->prelude)) {
+            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                    gbnf_format_literal(form.scope_end).c_str(),
+                    gbnf_format_literal(tc->prelude).c_str()
+            );
+            return return_error(builder, start_pos, recovery);
+        }
+    } else {
+        if (all_space(form.scope_end)) return true;
+        builder.consume_spaces();
+        if (builder.pos() == builder.input().size())
+            throw common_chat_msg_partial_exception("incomplete tool calls");
+        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                gbnf_format_literal(form.scope_end).c_str(),
+                gbnf_format_literal(builder.consume_rest()).c_str()
+        );
+        return return_error(builder, start_pos, recovery);
+    }
+
+    return true;
+}
+
+/**
+ * Try to consume XML-Style tool calls for the given xml_tool_call_format by delegating to parse_xml_tool_calls().
+ * If that throws xml_toolcall_syntax_exception, the position and the number of recorded tool calls are restored and false is returned.
+ * Any other exception propagates to the caller. form.scope_start, form.tool_sep and form.scope_end can be empty.
+ */
+bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
+    const auto saved_pos        = pos_;
+    const auto saved_call_count = result_.tool_calls.size();
+    try { return parse_xml_tool_calls(*this, form); } catch (const xml_toolcall_syntax_exception &) {
+        move_to(saved_pos);                            // undo any input consumed before the syntax error
+        result_.tool_calls.resize(saved_call_count);   // drop tool calls added by the failed attempt
+    }
+    return false;
+}
+
+/**
+ * Parse content uses reasoning and XML-Style tool call
+ * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
+ */
+inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
+    constexpr auto rstrip = [](std::string &s) {
+        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
+    };
+    // Erase substring from l to r, along with additional spaces nearby
+    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
+        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l])));
+        ++l;
+        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r])));
+        if (l < r) str[l] = '\n';
+        if (l + 1 < r) str[l + 1] = '\n';
+        if (l != 0) l += 2;
+        str.erase(l, r - l);
+        return l;
+    };
+    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
+        auto best_match = content.size();
+        for (auto pattern: list) {
+            if (pattern.size() == 0) continue;
+            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
+                auto match_len = content.size() - match_idx;
+                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
+                    best_match = match_idx;
+                }
+            }
+        }
+        if (content.size() > best_match) {
+            content.erase(best_match);
+        }
+    };
+    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
+        return trim_suffix(content, {
+            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
+            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
+            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
+            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
+            form.scope_end
+        });
+    };
+
+
+    // Trim leading spaces without affecting keyword matching
+    static const common_regex spaces_regex("\\s*");
+    {
+        auto tc = builder.consume_regex(spaces_regex);
+        auto spaces = builder.str(tc.groups[0]);
+        auto s1 = spaces.size();
+        trim_potential_partial_word(spaces);
+        auto s2 = spaces.size();
+        builder.move_to(builder.pos() - (s1 - s2));
+    }
+
+    // Parse content
+    bool reasoning_unclosed = builder.syntax().thinking_forced_open;
+    std::string unclosed_reasoning_content("");
+    for (;;) {
+        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
+        std::string content;
+        std::string tool_call_start;
+
+        if (tc) {
+            content = std::move(tc->prelude);
+            tool_call_start = builder.str(tc->groups[0]);
+            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
+        } else {
+            content = builder.consume_rest();
+            utf8_truncate_safe_resize(content);
+        }
+
+        // Handle unclosed think block
+        if (reasoning_unclosed) {
+            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
+                unclosed_reasoning_content += content;
+                if (form.allow_toolcall_in_think) {
+                    builder.move_to(tc->groups[0].begin);
+                    if (!builder.try_consume_xml_tool_calls(form)) {
+                        unclosed_reasoning_content += tool_call_start;
+                        builder.move_to(tc->groups[0].end);
+                    }
+                } else {
+                    unclosed_reasoning_content += tool_call_start;
+                }
+                continue;
+            } else {
+                reasoning_unclosed = false;
+                std::string reasoning_content;
+                if (pos == std::string::npos) {
+                    reasoning_content = std::move(content);
+                } else {
+                    reasoning_content = content.substr(0, pos);
+                    content.erase(0, pos + end_think.size());
+                }
+                if (builder.pos() == builder.input().size() && all_space(content)) {
+                    rstrip(reasoning_content);
+                    trim_potential_partial_word(reasoning_content);
+                    rstrip(reasoning_content);
+                    if (reasoning_content.empty()) {
+                        rstrip(unclosed_reasoning_content);
+                        trim_potential_partial_word(unclosed_reasoning_content);
+                        rstrip(unclosed_reasoning_content);
+                        if (unclosed_reasoning_content.empty()) continue;
+                    }
+                }
+                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+                    builder.add_content(start_think);
+                    builder.add_content(unclosed_reasoning_content);
+                    builder.add_content(reasoning_content);
+                    if (builder.pos() != builder.input().size() || !all_space(content))
+                        builder.add_content(end_think);
+                } else {
+                    builder.add_reasoning_content(unclosed_reasoning_content);
+                    builder.add_reasoning_content(reasoning_content);
+                }
+                unclosed_reasoning_content.clear();
+            }
+        }
+
+        // Handle multiple think block
+        bool toolcall_in_think = false;
+        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
+            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
+                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
+                    builder.add_reasoning_content(reasoning_content);
+                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
+                } else {
+                    think_start = think_end + end_think.size() - 1;
+                }
+            } else {
+                // This <tool_call> start is in thinking block, skip this tool call
+                auto pos = think_start + start_think.size();
+                unclosed_reasoning_content = content.substr(pos) + tool_call_start;
+                reasoning_unclosed = true;
+                content.resize(think_start);
+                toolcall_in_think = true;
+            }
+        }
+
+        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+            rstrip(content);
+            // Handle unclosed </think> token from content: delete all </think> token
+            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
+                while (pos != std::string::npos) {
+                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
+                    pos = content.rfind(end_think, pos);
+                }
+            }
+            // Strip if needed
+            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
+                content = string_strip(content);
+            }
+        }
+
+        // remove potential partial suffix
+        if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) {
+            rstrip(content);
+            trim_potential_partial_word(content);
+            rstrip(content);
+        }
+
+        // Add content
+        if (content.size() != 0) {
+            // If there are multiple content blocks
+            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
+                builder.add_content("\n\n");
+            }
+            builder.add_content(content);
+        }
+
+        // This <tool_call> start is in thinking block, skip this tool call
+        if (toolcall_in_think && !form.allow_toolcall_in_think) {
+            continue;
+        }
+
+        // There is no tool call and all content is parsed
+        if (!tc) {
+            GGML_ASSERT(builder.pos() == builder.input().size());
+            GGML_ASSERT(unclosed_reasoning_content.empty());
+            GGML_ASSERT(!reasoning_unclosed);
+            break;
+        }
+
+        builder.move_to(tc->groups[0].begin);
+        if (builder.try_consume_xml_tool_calls(form)) {
+            auto end_of_tool = builder.pos();
+            builder.consume_spaces();
+            if (builder.pos() != builder.input().size()) {
+                builder.move_to(end_of_tool);
+                if (!builder.result().content.empty()) {
+                    builder.add_content("\n\n");
+                }
+            }
+        } else {
+            static const common_regex next_char_regex(".");
+            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
+            rstrip(c);
+            builder.add_content(c);
+        }
+    }
+}
+
+/**
+ * Parse content that uses reasoning blocks and XML-style tool calls.
+ * Member entry point: forwards to parse_msg_with_xml_tool_calls() with this parser as the builder.
+ * start_think / end_think are the literals that open and close a reasoning block.
+ * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
+ */
+void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
+    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
+}
--- a/common/chat-parser-xml-toolcall.h
+++ b/common/chat-parser-xml-toolcall.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "chat.h"
+
+#include <nlohmann/json.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+
+// Describes the literal delimiters of an XML-style tool call syntax.
+// NOTE: several call sites initialize this struct positionally, so the member order must not change.
+//
+// Sample config:
+// MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
+// GLM 4.5   (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
+struct xml_tool_call_format {
+    std::string scope_start; // <minimax:tool_call>\n  // \n                      // can be empty
+    std::string tool_start;  // <invoke name=\"        // <tool_call>
+    std::string tool_sep;    // \">\n                  // \n                      // can be empty only for parse_xml_tool_calls
+    std::string key_start;   // <parameter name=\"     // <arg_key>
+    std::string key_val_sep; // \">                    // </arg_key>\n<arg_value>
+    std::string val_end;     // </parameter>\n         // </arg_value>\n
+    std::string tool_end;    // </invoke>\n            // </tool_call>\n
+    std::string scope_end;   // </minimax:tool_call>   //                         // can be empty
+    // Set this if there can be dynamic spaces inside key_val_sep.
+    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
+    std::optional<std::string> key_val_sep2 = std::nullopt;
+    // Set true if argval should only be raw string. e.g. Hello "world" hi
+    // Set false if argval should only be json string. e.g. "Hello \"world\" hi"
+    // Defaults to std::nullopt, both will be allowed.
+    std::optional<bool> raw_argval = std::nullopt;
+    // NOTE(review): presumably the terminator expected after the final argument value instead of val_end
+    // (JSON-like formats pair val_end=", " with last_val_end="") - confirm against the parser.
+    std::optional<std::string> last_val_end = std::nullopt;
+    // NOTE(review): presumably the terminator expected after the final tool call instead of tool_end
+    // (Apriel 1.5 pairs tool_end="}, " with last_tool_end="}") - confirm against the parser.
+    std::optional<std::string> last_tool_end = std::nullopt;
+    // NOTE(review): presumably trims surrounding whitespace from raw argument values - confirm against the parser.
+    bool trim_raw_argval = false;
+    bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
+};
+
+// Make a GBNF that accepts any string except those containing any of the forbidden strings.
+std::string make_gbnf_excluding(std::vector<std::string> forbids);
+
+/**
+ * Build the grammar for an XML-style tool call described by `form`.
+ * form.scope_start and form.scope_end can be empty.
+ * Requires data.format for model-specific hacks, so callers should set data.format before calling.
+ */
+void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -1,6 +1,7 @@
 #pragma once

 #include "chat.h"
+#include "chat-parser-xml-toolcall.h"
 #include "json-partial.h"
 #include "regex-partial.h"

@@ -119,5 +120,14 @@ class common_chat_msg_parser {
        const std::vector<std::vector<std::string>> & content_paths = {}
    );

+    /**
+     * Parse XML-style tool calls described by the given xml_tool_call_format.
+     * Returns false on invalid syntax and leaves the parser position untouched.
+     * form.scope_start, form.tool_sep and form.scope_end can be empty.
+     */
+    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);
+
+    // Parse content that uses reasoning blocks and XML-style tool calls.
+    // start_think / end_think are the literals delimiting a reasoning block (default: <think> ... </think>).
+    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");
+
    void clear_tools();
 };
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -9,8 +9,11 @@
 #include <minja/chat-template.hpp>
 #include <minja/minja.hpp>

+#include <algorithm>
 #include <cstdio>
+#include <cctype>
 #include <exception>
+#include <functional>
 #include <iostream>
 #include <optional>
 #include <stdexcept>
@@ -310,7 +313,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
        }
        if (!msg.reasoning_content.empty()) {
            jmsg["reasoning_content"] = msg.reasoning_content;
-            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
        }
        if (!msg.tool_name.empty()) {
            jmsg["name"] = msg.tool_name;
@@ -640,6 +642,13 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
        case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
+        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
+        case COMMON_CHAT_FORMAT_MINIMAX_M2: return "MiniMax-M2";
+        case COMMON_CHAT_FORMAT_GLM_4_5: return "GLM 4.5";
+        case COMMON_CHAT_FORMAT_KIMI_K2: return "Kimi K2";
+        case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
+        case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
+        case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
        default:
            throw std::runtime_error("Unknown chat format");
    }
@@ -986,6 +995,126 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
    return data;
 }

+
+// Case-insensitive find (per-byte, using std::tolower in the current C locale).
+// Returns the index of the first occurrence of `needle` in `haystack` at or after `pos`,
+// or std::string::npos when there is none (including when `pos` is past the end of `haystack`).
+static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
+    // haystack.begin() + pos is undefined behavior once pos exceeds size(); report "not found" instead
+    if (pos > haystack.size()) {
+        return std::string::npos;
+    }
+    auto it = std::search(
+        haystack.begin() + pos, haystack.end(),
+        needle.begin(), needle.end(),
+        // std::tolower requires a value representable as unsigned char; plain char may be negative
+        // (e.g. UTF-8 bytes), so convert before calling it
+        [](unsigned char a, unsigned char b) { return std::tolower(a) == std::tolower(b); }
+    );
+    return (it == haystack.end()) ? std::string::npos : static_cast<size_t>(std::distance(haystack.begin(), it));
+}
+
+// Build chat params for LFM2 templates.
+// Depending on the inputs this either builds a lazy JSON tool-call grammar (tools + "force json schema." marker
+// in the system message), passes tools through without a grammar, or uses the provided json_schema / grammar.
+// Throws std::runtime_error when tools are combined with "json_schema" or "grammar".
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    const auto is_json_schema_provided = !inputs.json_schema.is_null();
+    const auto is_grammar_provided = !inputs.grammar.empty();
+    const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();
+
+    // the logic requires potentially modifying the messages
+    auto tweaked_messages = inputs.messages;
+
+    // Removes the first (case-insensitive) occurrence of the marker from the leading system message.
+    // Returns true when a marker was found and stripped, false otherwise (including when there is no
+    // leading system message).
+    auto replace_json_schema_marker = [](json & messages) -> bool {
+        static std::string marker1 = "force json schema.\n";
+        static std::string marker2 = "force json schema.";
+
+        if (messages.empty() || messages.at(0).at("role") != "system") {
+            return false;
+        }
+
+        std::string content = messages.at(0).at("content");
+
+        // the variant with the trailing newline is tried first so the newline is removed together with the marker
+        for (const auto & marker : {marker1, marker2}) {
+            const auto pos = ifind_string(content, marker);
+            if (pos != std::string::npos) {
+                content.replace(pos, marker.length(), "");
+                // inject modified content back into the messages
+                messages.at(0).at("content") = content;
+                return true;
+            }
+        }
+
+        return false;
+    };
+
+    // Lfm2 model does not natively work with json, but can generally understand the tools structure
+    //
+    // Example of the pytorch dialog structure:
+    //     <|startoftext|><|im_start|>system
+    //     List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
+    //     <|im_start|>user
+    //     What is the current status of candidate ID 12345?<|im_end|>
+    //     <|im_start|>assistant
+    //     <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
+    //     <|im_start|>tool
+    //     <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
+    //     <|im_start|>assistant
+    //     The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
+    //
+    // For the llama server compatibility with json tools semantic,
+    // the client can add a "force json schema." line (matched case-insensitively) to the system message prompt
+    // to force the json output.
+    //
+    if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
+        // server/utils.hpp prohibits that branch for the custom grammar anyways
+        throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
+    } else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
+        LOG_INF("%s: Using tools to build a grammar\n", __func__);
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            // one object schema per tool: {"name": <const tool name>, "arguments": <tool parameters>}
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {
+                            {"type", "string"},
+                            {"const", function.at("name")},
+                        }},
+                        {"arguments", function.at("parameters")},
+                    }},
+                    // NOTE(review): "id" is listed as required but has no entry in "properties" - confirm this is intended
+                    {"required", json::array({"name", "arguments", "id"})},
+                });
+            });
+            // the model output is an array of tool calls
+            auto schema = json {
+                {"type", "array"},
+                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+                {"minItems", 1},
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+
+            builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
+        });
+        // model has no concept of tool selection mode choice,
+        // if the system prompt rendered correctly it will produce a tool call
+        // the grammar goes inside the tool call body
+        data.grammar_lazy = true;
+        data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
+        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+        data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
+    } else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
+        // tools without the marker: no grammar, data.format keeps its default value
+        LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
+        // output those tokens
+        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+    } else if (is_json_schema_provided) {
+        LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
+        data.grammar = json_schema_to_grammar(inputs.json_schema);
+    } else if (is_grammar_provided) {
+        LOG_INF("%s: Using provided grammar\n", __func__);
+        data.grammar = inputs.grammar;
+    } else {
+        LOG_INF("%s: Using content relying on the template\n", __func__);
+    }
+
+    // render with the (possibly marker-stripped) messages
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
+    LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
@@ -1684,9 +1813,297 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
    }
 }

+
+// Build chat params for MiniMax-M2 templates: render the prompt, resolve an open <think> block,
+// register the special tokens to preserve and generate the tool-call grammar.
+static common_chat_params common_chat_params_init_minimax_m2(const common_chat_template & tmpl, const struct templates_params & params) {
+    const bool has_tools = params.tools.is_array() && !params.tools.empty();
+
+    common_chat_params data;
+    // the grammar is only applied lazily when tools exist and a tool call is not mandatory
+    data.grammar_lazy = has_tools && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+    data.prompt       = apply(tmpl, params);
+    data.format       = COMMON_CHAT_FORMAT_MINIMAX_M2;
+
+    // The rendered prompt may end with an opened thinking block
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (params.enable_thinking) {
+            // template started the <think> block for us: reasoning is forced open
+            data.thinking_forced_open = true;
+        } else {
+            // thinking disabled: close the block right away
+            data.prompt += "</think>\n\n";
+        }
+    }
+
+    // MiniMax-M2 special tokens that must be preserved
+    data.preserved_tokens = {
+        "<think>",
+        "</think>",
+        "<minimax:tool_call>",
+        "</minimax:tool_call>",
+    };
+
+    // Tool-call layout used for grammar generation (includes the newlines the model is expected to emit)
+    static const xml_tool_call_format form = ([]() {
+        xml_tool_call_format fmt {};
+        fmt.scope_start = "<minimax:tool_call>\n";
+        fmt.tool_start  = "<invoke name=\"";
+        fmt.tool_sep    = "\">\n";
+        fmt.key_start   = "<parameter name=\"";
+        fmt.key_val_sep = "\">";
+        fmt.val_end     = "</parameter>\n";
+        fmt.tool_end    = "</invoke>\n";
+        fmt.scope_end   = "</minimax:tool_call>";
+        return fmt;
+    })();
+    build_grammar_xml_tool_call(data, params.tools, form);
+
+    return data;
+}
+
+// Parse MiniMax-M2 output: <think> reasoning plus <minimax:tool_call> XML tool calls.
+static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
+    // Parser-side layout: same delimiters as the grammar, without the trailing newlines
+    static const xml_tool_call_format form = ([]() {
+        xml_tool_call_format fmt {};
+        fmt.scope_start = "<minimax:tool_call>";
+        fmt.tool_start  = "<invoke name=\"";
+        fmt.tool_sep    = "\">";
+        fmt.key_start   = "<parameter name=\"";
+        fmt.key_val_sep = "\">";
+        fmt.val_end     = "</parameter>";
+        fmt.tool_end    = "</invoke>";
+        fmt.scope_end   = "</minimax:tool_call>";
+        return fmt;
+    })();
+    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
+}
+
+// Build chat params for Qwen3 Coder (XML tool calls): render the prompt, register the
+// tool-call tags to preserve and generate the tool-call grammar.
+static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
+    const bool has_tools = params.tools.is_array() && !params.tools.empty();
+
+    common_chat_params data;
+    // the grammar is only applied lazily when tools exist and a tool call is not mandatory
+    data.grammar_lazy = has_tools && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+    data.prompt       = apply(tmpl, params);
+    data.format       = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
+
+    data.preserved_tokens = {
+        "<tool_call>",
+        "</tool_call>",
+        "<function=",
+        "</function>",
+        "<parameter=",
+        "</parameter>",
+    };
+
+    // Tool-call layout used for grammar generation (includes the newlines the model is expected to emit)
+    static const xml_tool_call_format form = ([]() {
+        xml_tool_call_format fmt {};
+        fmt.scope_start = "<tool_call>\n";
+        fmt.tool_start  = "<function=";
+        fmt.tool_sep    = ">\n";
+        fmt.key_start   = "<parameter=";
+        fmt.key_val_sep = ">\n";
+        fmt.val_end     = "\n</parameter>\n";
+        fmt.tool_end    = "</function>\n";
+        fmt.scope_end   = "</tool_call>";
+        return fmt;
+    })();
+    build_grammar_xml_tool_call(data, params.tools, form);
+
+    return data;
+}
+
+// Parse Qwen3 Coder output: reasoning (default think tags) plus <tool_call>/<function=...> XML tool calls.
+static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
+    // Parser-side layout: newline-free delimiters, raw argument values are trimmed
+    const auto make_form = []() {
+        xml_tool_call_format fmt {};
+        fmt.scope_start     = "<tool_call>";
+        fmt.tool_start      = "<function=";
+        fmt.tool_sep        = ">";
+        fmt.key_start       = "<parameter=";
+        fmt.key_val_sep     = ">";
+        fmt.val_end         = "</parameter>";
+        fmt.tool_end        = "</function>";
+        fmt.scope_end       = "</tool_call>";
+        fmt.trim_raw_argval = true;
+        return fmt;
+    };
+    static const xml_tool_call_format form = make_form();
+    builder.consume_reasoning_with_xml_tool_calls(form);
+}
+
+// Tool-call format shared by the Kimi K2 grammar builder and the Kimi K2 parser.
+// A single definition guarantees that the generated grammar and the parser cannot drift apart.
+// Arguments are JSON-only (raw_argval = false), separated by ", ", with an empty terminator
+// configured for the last value.
+static const xml_tool_call_format & common_chat_kimi_k2_tool_call_format() {
+    static const xml_tool_call_format form = ([]() {
+        xml_tool_call_format form {};
+        form.scope_start = "<|tool_calls_section_begin|>";
+        form.tool_start  = "<|tool_call_begin|>";
+        form.tool_sep    = "<|tool_call_argument_begin|>{";
+        form.key_start   = "\"";
+        form.key_val_sep = "\": ";
+        form.val_end     = ", ";
+        form.tool_end    = "}<|tool_call_end|>";
+        form.scope_end   = "<|tool_calls_section_end|>";
+        form.raw_argval  = false;
+        form.last_val_end = "";
+        return form;
+    })();
+    return form;
+}
+
+// Build chat params for Kimi K2 templates: render the prompt, register preserved tokens and
+// extra stop strings, and generate the tool-call grammar.
+static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
+    common_chat_params data;
+    // the grammar is only applied lazily when tools exist and a tool call is not mandatory
+    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+    data.prompt = apply(tmpl, params);
+    // set before build_grammar_xml_tool_call(), which is documented to require data.format
+    data.format = COMMON_CHAT_FORMAT_KIMI_K2;
+
+    data.preserved_tokens = {
+        "<think>",
+        "</think>",
+        "<|tool_calls_section_begin|>",
+        "<|tool_call_begin|>",
+        "<|tool_call_argument_begin|>",
+        "<|tool_call_end|>",
+        "<|tool_calls_section_end|>",
+        "<|im_end|>",
+        "<|im_system|>",
+        "<|im_middle|>",
+    };
+
+    data.additional_stops.insert(data.additional_stops.end(), {
+        "<|im_end|>",
+        "<|im_middle|>"
+    });
+    // build grammar for tool call
+    build_grammar_xml_tool_call(data, params.tools, common_chat_kimi_k2_tool_call_format());
+
+    return data;
+}
+
+// Parse Kimi K2 output: <think> reasoning plus tool calls in the shared Kimi K2 format.
+static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
+    builder.consume_reasoning_with_xml_tool_calls(common_chat_kimi_k2_tool_call_format(), "<think>", "</think>");
+}
+
+// Tool-call format shared by the Apriel 1.5 grammar builder and the Apriel 1.5 parser.
+// A single definition guarantees that the generated grammar and the parser cannot drift apart.
+// Tool calls are a JSON-like array inside <tool_calls>[ ... ]</tool_calls>; arguments are
+// JSON-only (raw_argval = false), and dedicated terminators are configured for the last value
+// and the last tool call.
+static const xml_tool_call_format & common_chat_apriel_1_5_tool_call_format() {
+    static const xml_tool_call_format form = ([]() {
+        xml_tool_call_format form {};
+        form.scope_start = "<tool_calls>[";
+        form.tool_start  = "{\"name\": \"";
+        form.tool_sep    = "\", \"arguments\": {";
+        form.key_start   = "\"";
+        form.key_val_sep = "\": ";
+        form.val_end     = ", ";
+        form.tool_end    = "}, ";
+        form.scope_end   = "]</tool_calls>";
+        form.raw_argval  = false;
+        form.last_val_end = "";
+        form.last_tool_end = "}";
+        return form;
+    })();
+    return form;
+}
+
+// Build chat params for Apriel 1.5 templates: render the prompt, register preserved tokens
+// and generate the tool-call grammar.
+static common_chat_params common_chat_params_init_apriel_1_5(const common_chat_template & tmpl, const struct templates_params & params) {
+    common_chat_params data;
+    // the grammar is only applied lazily when tools exist and a tool call is not mandatory
+    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+    data.prompt = apply(tmpl, params);
+    // set before build_grammar_xml_tool_call(), which is documented to require data.format
+    data.format = COMMON_CHAT_FORMAT_APRIEL_1_5;
+
+    data.preserved_tokens = {
+        "<thinking>",
+        "</thinking>",
+        "<tool_calls>",
+        "</tool_calls>",
+    };
+
+    // build grammar for tool call
+    build_grammar_xml_tool_call(data, params.tools, common_chat_apriel_1_5_tool_call_format());
+
+    return data;
+}
+
+// Parse Apriel 1.5 output: <thinking> reasoning plus tool calls in the shared Apriel 1.5 format.
+static void common_chat_parse_apriel_1_5(common_chat_msg_parser & builder) {
+    builder.consume_reasoning_with_xml_tool_calls(common_chat_apriel_1_5_tool_call_format(), "<thinking>", "</thinking>");
+}
+
+// Build chat params for Xiaomi MiMo templates: render the prompt, register the tool-call tags
+// to preserve and generate the tool-call grammar.
+static common_chat_params common_chat_params_init_xiaomi_mimo(const common_chat_template & tmpl, const struct templates_params & params) {
+    const bool has_tools = params.tools.is_array() && !params.tools.empty();
+
+    common_chat_params data;
+    // the grammar is only applied lazily when tools exist and a tool call is not mandatory
+    data.grammar_lazy = has_tools && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+    data.prompt       = apply(tmpl, params);
+    data.format       = COMMON_CHAT_FORMAT_XIAOMI_MIMO;
+
+    data.preserved_tokens = {
+        "<tool_call>",
+        "</tool_call>",
+    };
+
+    // Grammar-side layout: unlike the parser-side layout, the scope starts with a newline
+    const auto make_form = []() {
+        xml_tool_call_format fmt {};
+        fmt.scope_start  = "\n";
+        fmt.tool_start   = "<tool_call>\n{\"name\": \"";
+        fmt.tool_sep     = "\", \"arguments\": {";
+        fmt.key_start    = "\"";
+        fmt.key_val_sep  = "\": ";
+        fmt.val_end      = ", ";
+        fmt.tool_end     = "}\n</tool_call>";
+        fmt.scope_end    = "";
+        fmt.raw_argval   = false;
+        fmt.last_val_end = "";
+        return fmt;
+    };
+    static const xml_tool_call_format form = make_form();
+    build_grammar_xml_tool_call(data, params.tools, form);
+
+    return data;
+}
+
+// Parse Xiaomi MiMo output: reasoning (default think tags) plus <tool_call> JSON-like tool calls.
+static void common_chat_parse_xiaomi_mimo(common_chat_msg_parser & builder) {
+    // Parser-side layout: no scope delimiters at all
+    const auto make_form = []() {
+        xml_tool_call_format fmt {};
+        fmt.scope_start  = "";
+        fmt.tool_start   = "<tool_call>\n{\"name\": \"";
+        fmt.tool_sep     = "\", \"arguments\": {";
+        fmt.key_start    = "\"";
+        fmt.key_val_sep  = "\": ";
+        fmt.val_end      = ", ";
+        fmt.tool_end     = "}\n</tool_call>";
+        fmt.scope_end    = "";
+        fmt.raw_argval   = false;
+        fmt.last_val_end = "";
+        return fmt;
+    };
+    static const xml_tool_call_format form = make_form();
+    builder.consume_reasoning_with_xml_tool_calls(form);
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
-    auto prompt = apply(tmpl, inputs);
+
+    // Copy reasoning to the "thinking" field as expected by the gpt-oss template
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
+        auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
+
+        if (has_reasoning_content && has_tool_calls) {
+            auto adjusted_message = msg;
+            adjusted_message["thinking"] = msg.at("reasoning_content");
+            adjusted_messages.push_back(adjusted_message);
+        } else {
+            adjusted_messages.push_back(msg);
+        }
+    }
+
+    auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);

    // Check if we need to replace the return token with end token during
    // inference and without generation prompt. For more details see:
@@ -1902,6 +2319,100 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
    }
 }

+// Build chat params for GLM 4.5 templates: render and trim the prompt, resolve an open <think>
+// block, register preserved tokens and extra stop words, and generate the tool-call grammar.
+static common_chat_params common_chat_params_init_glm_4_5(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    // the grammar is only applied lazily when tools exist and a tool call is not mandatory
+    data.grammar_lazy = inputs.tools.is_array() && !inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+    // build_grammar_xml_tool_call() is documented to require data.format, so set it up front
+    // (as every other XML tool-call init function does) instead of after the grammar is built
+    data.format = COMMON_CHAT_FORMAT_GLM_4_5;
+
+    std::string prompt = apply(tmpl, inputs);
+
+    // match the existing trimming behavior
+    if (inputs.add_bos && string_starts_with(prompt, tmpl.bos_token())) {
+        prompt.erase(0, tmpl.bos_token().size());
+    }
+    if (inputs.add_eos && string_ends_with(prompt, tmpl.eos_token())) {
+        prompt.erase(prompt.size() - tmpl.eos_token().size());
+    }
+    // the rendered prompt may end with an opened thinking block
+    if (string_ends_with(prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            // thinking disabled: close the block right away
+            prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // add GLM preserved tokens
+    data.preserved_tokens = {
+        "<|endoftext|>",
+        "[MASK]",
+        "[gMASK]",
+        "[sMASK]",
+        "<sop>",
+        "<eop>",
+        "<|system|>",
+        "<|user|>",
+        "<|assistant|>",
+        "<|observation|>",
+        "<|begin_of_image|>",
+        "<|end_of_image|>",
+        "<|begin_of_video|>",
+        "<|end_of_video|>",
+        "<|begin_of_audio|>",
+        "<|end_of_audio|>",
+        "<|begin_of_transcription|>",
+        "<|end_of_transcription|>",
+        "<|code_prefix|>",
+        "<|code_middle|>",
+        "<|code_suffix|>",
+        "/nothink",
+        "<think>",
+        "</think>",
+        "<tool_call>",
+        "</tool_call>",
+        "<arg_key>",
+        "</arg_key>",
+        "<arg_value>",
+        "</arg_value>"
+    };
+
+    // extra GLM 4.5 stop word
+    data.additional_stops.insert(data.additional_stops.end(), {
+        "<|user|>",
+        "<|observation|>"
+    });
+
+    // build grammar for tool call
+    static const xml_tool_call_format form {
+        /* form.scope_start = */ "",
+        /* form.tool_start  = */ "\n<tool_call>",
+        /* form.tool_sep    = */ "\n",
+        /* form.key_start   = */ "<arg_key>",
+        /* form.key_val_sep = */ "</arg_key>\n<arg_value>",
+        /* form.val_end     = */ "</arg_value>\n",
+        /* form.tool_end    = */ "</tool_call>\n",
+        /* form.scope_end   = */ "",
+    };
+    build_grammar_xml_tool_call(data, inputs.tools, form);
+
+    data.prompt = prompt;
+    return data;
+}
+
+// Parse GLM 4.5 output: <think> reasoning plus <tool_call> blocks with <arg_key>/<arg_value> pairs.
+static void common_chat_parse_glm_4_5(common_chat_msg_parser & builder) {
+    // Parser-side layout: the key/value separator is split in two (key_val_sep + key_val_sep2)
+    static const xml_tool_call_format form = ([]() {
+        xml_tool_call_format fmt {};
+        fmt.scope_start  = "";
+        fmt.tool_start   = "<tool_call>";
+        fmt.tool_sep     = "";
+        fmt.key_start    = "<arg_key>";
+        fmt.key_val_sep  = "</arg_key>";
+        fmt.val_end      = "</arg_value>";
+        fmt.tool_end     = "</tool_call>";
+        fmt.scope_end    = "";
+        fmt.key_val_sep2 = "<arg_value>";
+        return fmt;
+    })();
+    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    LOG_DBG("%s\n", __func__);
    common_chat_params data;
@@ -2499,94 +3010,85 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
    builder.add_content(builder.consume_rest());
 }

-static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
-    // Parse thinking tags first - this handles the main reasoning content
-    builder.try_parse_reasoning("<seed:think>", "</seed:think>");

+static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

-    // Parse tool calls - Seed-OSS uses <seed:tool_call> format
-    static const common_regex tool_call_begin_regex("<seed:tool_call>");
-    static const common_regex tool_call_end_regex("</seed:tool_call>");
-    static const common_regex function_regex("<function=([^>]+)>");
-    static const common_regex param_regex("<parameter=([^>]+)>");
+    // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
+    static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
+    static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));

-    while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
-        builder.consume_spaces();  // Consume whitespace after <seed:tool_call>
+    // Loop through all tool calls
+    while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
+        builder.move_to(res->groups[0].end);

-        // Look for function call inside tool call, ignore any content before it
-        if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
-            auto function_name = builder.str(func_res->groups[1]);
+        // Parse JSON array format: [{"name": "...", "arguments": {...}}]
+        auto tool_calls_data = builder.consume_json();

-            // Parse Seed-OSS parameters <parameter=name>value</parameter>
-            json args = json::object();
-            // Parse all parameters
-            while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
-                // again, ignore noise around parameters
-                auto param_name = builder.str(param_res->groups[1]);
-                builder.move_to(param_res->groups[0].end);
-                builder.consume_spaces();  // Consume whitespace after parameter
-                auto savedPos = builder.pos();
-                if (auto param_parse = builder.try_find_literal("</parameter>")) {
-                    auto param = param_parse->prelude;
-                    builder.move_to(savedPos);
-                    try {
-                        if (auto param_res = builder.try_consume_json()) {
-                            args[param_name] = param_res->json;
-                        } else {
-                            args[param_name] = param;
-                        }
-                    } catch (json::exception &) {
-                        args[param_name] = param;
-                    }
-                } else {
-                    throw common_chat_msg_partial_exception("Incomplete tool parameter");
+        // Consume end marker
+        builder.consume_spaces();
+        if (!builder.try_consume_regex(tool_call_end_regex)) {
+            throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
+        }
+
+        // Process each tool call in the array
+        if (tool_calls_data.json.is_array()) {
+            for (const auto & tool_call : tool_calls_data.json) {
+                if (!tool_call.is_object()) {
+                    throw common_chat_msg_partial_exception("Tool call must be an object");
                }
-            }
-            // Look for closing function tag
-            auto end_func = builder.try_find_literal("</function>");
-            if (end_func) {
-                builder.move_to(end_func->groups[0].end);
-                builder.consume_spaces();  // Consume whitespace after </function>

-                // Add the tool call with parsed arguments, but only if we REALLY got the literal
-                auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
-                auto funlen = std::string("</function>").length();
-                if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
-                    if (!builder.add_tool_call(function_name, "", args.dump())) {
-                        throw common_chat_msg_partial_exception("Incomplete tool call");
+                if (!tool_call.contains("name")) {
+                    throw common_chat_msg_partial_exception("Tool call missing 'name' field");
+                }
+
+                std::string function_name = tool_call.at("name");
+                std::string arguments = "{}";
+
+                if (tool_call.contains("arguments")) {
+                    if (tool_call.at("arguments").is_object()) {
+                        arguments = tool_call.at("arguments").dump();
+                    } else if (tool_call.at("arguments").is_string()) {
+                        arguments = tool_call.at("arguments");
                    }
-                } else {
+                }
+
+                if (!builder.add_tool_call(function_name, "", arguments)) {
                    throw common_chat_msg_partial_exception("Incomplete tool call");
                }
-            } else {
-                throw common_chat_msg_partial_exception("Incomplete tool call");
-            }
-            // Look for closing tool call tag
-            if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
-                builder.move_to(end_tool->groups[0].end);
-                builder.consume_spaces();  // Consume trailing whitespace after tool call
-            } else {
-                throw common_chat_msg_partial_exception("Incomplete tool call");
            }
        } else {
-            // No function found - don't consume content here, let it be handled at the end
-            break;
+            throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
        }
+
+        // Consume any trailing whitespace after this tool call
+        builder.consume_spaces();
    }

-    // Consume any remaining whitespace after all tool call processing
-    builder.consume_spaces();
+    // Consume any remaining content after all tool calls
    auto remaining = builder.consume_rest();
-    // If there's any non-whitespace content remaining, add it as content
    if (!string_strip(remaining).empty()) {
        builder.add_content(remaining);
    }
 }

+static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+    static const xml_tool_call_format form {
+        /* form.scope_start = */ "<seed:tool_call>",
+        /* form.tool_start  = */ "<function=",
+        /* form.tool_sep    = */ ">",
+        /* form.key_start   = */ "<parameter=",
+        /* form.key_val_sep = */ ">",
+        /* form.val_end     = */ "</parameter>",
+        /* form.tool_end    = */ "</function>",
+        /* form.scope_end   = */ "</seed:tool_call>",
+    };
+    builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
@@ -2723,6 +3225,35 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_granite(tmpl, params);
    }

+    // GLM 4.5: detect by the [gMASK]<sop> marker plus the <arg_key> and <arg_value> tags (check before Hermes since both use <tool_call>)
+    if (src.find("[gMASK]<sop>") != std::string::npos &&
+        src.find("<arg_key>") != std::string::npos &&
+        src.find("<arg_value>") != std::string::npos &&
+        params.json_schema.is_null()) {
+        return common_chat_params_init_glm_4_5(tmpl, params);
+    }
+
+    // Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
+    // Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
+    // Require presence of the <tool_call>, <function>, <function=...>, <parameters>, and <parameter=...> markers.
+    if (src.find("<tool_call>") != std::string::npos &&
+        src.find("<function>") != std::string::npos &&
+        src.find("<function=") != std::string::npos &&
+        src.find("<parameters>") != std::string::npos &&
+        src.find("<parameter=") != std::string::npos) {
+        return common_chat_params_init_qwen3_coder_xml(tmpl, params);
+    }
+
+    // Xiaomi MiMo format detection (must come before Hermes 2 Pro)
+    if (src.find("<tools>") != std::string::npos &&
+        src.find("# Tools") != std::string::npos &&
+        src.find("</tools>") != std::string::npos &&
+        src.find("<tool_calls>") != std::string::npos &&
+        src.find("</tool_calls>") != std::string::npos &&
+        src.find("<tool_response>") != std::string::npos) {
+        return common_chat_params_init_xiaomi_mimo(tmpl, params);
+    }
+
    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_hermes_2_pro(tmpl, params);
@@ -2748,6 +3279,35 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_apertus(tmpl, params);
    }

+    // LFM2 (w/ tools)
+    if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
+        src.find("]<|tool_list_end|>") != std::string::npos) {
+        return common_chat_params_init_lfm2(tmpl, params);
+    }
+
+    // MiniMax-M2 format detection
+    if (src.find("]~!b[") != std::string::npos && src.find("]~b]") != std::string::npos) {
+        return common_chat_params_init_minimax_m2(tmpl, params);
+    }
+
+    // Kimi K2 format detection
+    if (src.find("<|im_system|>tool_declare<|im_middle|>") != std::string::npos &&
+        src.find("<|tool_calls_section_begin|>") != std::string::npos &&
+        src.find("## Return of") != std::string::npos) {
+        return common_chat_params_init_kimi_k2(tmpl, params);
+    }
+
+    // Apriel 1.5 format detection
+    if (src.find("<thinking>") != std::string::npos &&
+        src.find("</thinking>") != std::string::npos &&
+        src.find("<available_tools>") != std::string::npos &&
+        src.find("<|assistant|>") != std::string::npos &&
+        src.find("<|tool_result|>") != std::string::npos &&
+        src.find("<tool_calls>[") != std::string::npos &&
+        src.find("]</tool_calls>") != std::string::npos) {
+        return common_chat_params_init_apriel_1_5(tmpl, params);
+    }
+
    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2799,7 +3359,7 @@ static common_chat_params common_chat_templates_apply_legacy(
    const struct common_chat_templates * tmpls,
    const struct common_chat_templates_inputs & inputs)
 {
-    int alloc_size = 0;
+    size_t alloc_size = 0;
    std::vector<llama_chat_message> chat;
    std::vector<std::string> contents;

@@ -2821,7 +3381,8 @@ static common_chat_params common_chat_templates_apply_legacy(
        const auto & msg = inputs.messages[i];
        const auto & content = contents[i];
        chat.push_back({msg.role.c_str(), content.c_str()});
-        alloc_size += (msg.role.size() + content.size()) * 1.25;
+        size_t msg_size = msg.role.size() + content.size();
+        alloc_size += msg_size + (msg_size / 4); // == msg_size * 1.25 but avoiding float ops
    }

    std::vector<char> buf(alloc_size);
@@ -2843,6 +3404,11 @@ static common_chat_params common_chat_templates_apply_legacy(
        res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size());
    }

+    // for safety, we check the result again
+    if (res < 0 || (size_t) res > buf.size()) {
+        throw std::runtime_error("failed to apply chat template, try using --jinja");
+    }
+
    common_chat_params params;
    params.prompt = std::string(buf.data(), res);
    if (!inputs.json_schema.empty()) {
@@ -2926,6 +3492,27 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_APERTUS:
            common_chat_parse_apertus(builder);
            break;
+        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
+            common_chat_parse_lfm2(builder);
+            break;
+        case COMMON_CHAT_FORMAT_MINIMAX_M2:
+            common_chat_parse_minimax_m2(builder);
+            break;
+        case COMMON_CHAT_FORMAT_GLM_4_5:
+            common_chat_parse_glm_4_5(builder);
+            break;
+        case COMMON_CHAT_FORMAT_KIMI_K2:
+            common_chat_parse_kimi_k2(builder);
+            break;
+        case COMMON_CHAT_FORMAT_QWEN3_CODER_XML:
+            common_chat_parse_qwen3_coder_xml(builder);
+            break;
+        case COMMON_CHAT_FORMAT_APRIEL_1_5:
+            common_chat_parse_apriel_1_5(builder);
+            break;
+        case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
+            common_chat_parse_xiaomi_mimo(builder);
+            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
--- a/common/chat.h
+++ b/common/chat.h
@@ -116,6 +116,13 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
    COMMON_CHAT_FORMAT_APERTUS,
+    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
+    COMMON_CHAT_FORMAT_GLM_4_5,
+    COMMON_CHAT_FORMAT_MINIMAX_M2,
+    COMMON_CHAT_FORMAT_KIMI_K2,
+    COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
+    COMMON_CHAT_FORMAT_APRIEL_1_5,
+    COMMON_CHAT_FORMAT_XIAOMI_MIMO,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -8,6 +8,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"

 #include <algorithm>
 #include <cinttypes>
@@ -26,7 +27,6 @@
 #include <sstream>
 #include <string>
 #include <thread>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>

@@ -60,6 +60,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+common_time_meas::~common_time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
 //
 // CPU utils
 //
@@ -355,11 +363,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
 }

 void common_init() {
-    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
-            common_log_add(common_log_main(), level, "%s", text);
-        }
-    }, NULL);
+    llama_log_set(common_log_default_callback, NULL);

 #ifdef NDEBUG
    const char * build_type = "";
@@ -908,11 +912,96 @@ std::string fs_get_cache_file(const std::string & filename) {
    return cache_directory + filename;
 }

+std::vector<common_file_info> fs_list_files(const std::string & path) {
+    std::vector<common_file_info> files;
+    if (path.empty()) return files;
+
+    std::filesystem::path dir(path);
+    if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
+        return files;
+    }
+
+    for (const auto & entry : std::filesystem::directory_iterator(dir)) {
+        try {
+            // Only include regular files (skip directories)
+            const auto & p = entry.path();
+            if (std::filesystem::is_regular_file(p)) {
+                common_file_info info;
+                info.path = p.string();
+                info.name = p.filename().string();
+                try {
+                    info.size = static_cast<size_t>(std::filesystem::file_size(p));
+                } catch (const std::filesystem::filesystem_error &) {
+                    info.size = 0;
+                }
+                files.push_back(std::move(info));
+            }
+        } catch (const std::filesystem::filesystem_error &) {
+            // skip entries we cannot inspect
+            continue;
+        }
+    }
+
+    return files;
+}
+

 //
 // Model utils
 //

+static inline void common_init_sampler_from_model(
+    const llama_model * model,
+    common_params_sampling & sparams) {
+
+    const uint64_t config = sparams.user_sampling_config;
+
+    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[64] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            int32_t v = strtol(buf, &end, 10);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[128] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            float v = strtof(buf, &end);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    // Sampling sequence
+    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
+        char buf[512] = {0};
+        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
+            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
+            if (!sampler_names.empty()) {
+                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            }
+        }
+    }
+
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K),           sparams.top_k,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P),           sparams.top_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P),           sparams.min_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD),   sparams.xtc_threshold,   common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP),            sparams.temp,            common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N),  sparams.penalty_last_n,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT),  sparams.penalty_repeat,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT),        sparams.mirostat,        common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU),    sparams.mirostat_tau,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA),    sparams.mirostat_eta,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
    auto mparams = common_model_params_to_llama(params);
@@ -924,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }

+    common_init_sampler_from_model(model, params.sampling);
+
    const llama_vocab * vocab = llama_model_get_vocab(model);

    auto cparams = common_context_params_to_llama(params);
--- a/common/common.h
+++ b/common/common.h
@@ -2,17 +2,15 @@

 #pragma once

+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
-#include <sstream>
-#include <cmath>
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

+struct common_time_meas {
+    common_time_meas(int64_t & t_acc, bool disable = false);
+    ~common_time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
 struct common_adapter_lora_info {
    std::string path;
    float scale;
@@ -133,6 +140,22 @@ struct common_grammar_trigger {
    llama_token token = LLAMA_TOKEN_NULL;
 };

+enum common_params_sampling_config : uint64_t {
+    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
+    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
+};
+
+
 // sampling parameters
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -165,6 +188,8 @@ struct common_params_sampling {
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;

+    uint64_t user_sampling_config = 0; // bitfield of common_params_sampling_config flags; a set bit keeps that parameter from being overridden by model metadata
+
    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY


@@ -406,6 +431,8 @@ struct common_params {
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
+    int image_min_tokens = -1;
+    int image_max_tokens = -1;

    // finetune
    struct lr_opt lr;
@@ -458,7 +485,8 @@ struct common_params {
    float slot_prompt_similarity = 0.1f;

    // batched-bench params
-    bool is_pp_shared = false;
+    bool is_pp_shared   = false;
+    bool is_tg_separate = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
@@ -505,6 +533,10 @@ struct common_params {
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void *                  load_progress_callback_user_data = NULL;
+
+    bool has_speculative() const {
+        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
+    }
 };

 // call once at the start of a program if it uses libcommon
@@ -605,6 +637,13 @@ bool fs_create_directory_with_parents(const std::string & path);
 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);

+struct common_file_info {
+    std::string path;
+    std::string name;
+    size_t      size = 0; // in bytes
+};
+std::vector<common_file_info> fs_list_files(const std::string & path);
+
 //
 // Model utils
 //
--- a/common/download.cpp
+++ b/common/download.cpp
--- a/common/download.h
+++ b/common/download.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <string>
+
+struct common_params_model;
+
+//
+// download functionalities
+//
+
+struct common_cached_model_info {
+    std::string manifest_path;
+    std::string user;
+    std::string model;
+    std::string tag;
+    size_t      size = 0; // GGUF size in bytes
+    std::string to_string() const {
+        return user + "/" + model + ":" + tag;
+    }
+};
+
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
+};
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional and defaults to "latest" (meaning it checks for Q4_K_M first, then Q4, and if neither is found, returns the first GGUF file in the repo)
+ *
+ * Returns a common_hf_file_res holding the repo (with the tag already removed), the GGUF file and the mmproj file
+ *
+ * Note: we use the Ollama-compatible HF API, but do not use the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+common_hf_file_res common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & bearer_token,
+    bool offline);
+
+// returns true if download succeeded
+bool common_download_model(
+    const common_params_model & model,
+    const std::string & bearer_token,
+    bool offline);
+
+// returns list of cached models
+std::vector<common_cached_model_info> common_list_cached_models();
+
+// resolve and download model from Docker registry
+// return local path to downloaded model file
+std::string common_docker_resolve_model(const std::string & docker);
--- a/common/json-partial.cpp
+++ b/common/json-partial.cpp
@@ -297,8 +297,25 @@ bool common_json_parse(
            it = temptative_end;
            return true;
        }
-        // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
-        // fprintf(stderr, "Closing: TODO\n");
+        // handle unclosed top-level primitive
+        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;
+            if (can_parse(str + "\"")) {
+                // Was inside a string
+                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
+            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
+                // Was inside a string after an escape
+                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
+            } else {
+                // TODO: handle more unclosed top-level primitives if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+                // fprintf(stderr, "Closing: TODO\n");
+                return false;
+            }
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
        return false;
    }
    out.json = json::parse(it, end);
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }

-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();

    auto digit_range = [&](char from, char to) {
        out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    if (has_min) {
        if (min_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            }
            digit_range(c, c);
            out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
@@ -303,6 +303,8 @@ static std::string format_literal(const std::string & literal) {
    return "\"" + escaped + "\"";
 }

+std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
+
 class SchemaConverter {
 private:
    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
@@ -601,7 +603,10 @@ private:
    }

    std::string _resolve_ref(const std::string & ref) {
-        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
+        auto it = ref.find('#');
+        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+        std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
        if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
            _refs_being_resolved.insert(ref);
            json resolved = _refs[ref];
@@ -774,11 +779,24 @@ public:
                        std::vector<std::string> tokens = string_split(pointer, "/");
                        for (size_t i = 1; i < tokens.size(); ++i) {
                            std::string sel = tokens[i];
-                            if (target.is_null() || !target.contains(sel)) {
+                            if (target.is_object() && target.contains(sel)) {
+                                target = target[sel];
+                            } else if (target.is_array()) {
+                                size_t sel_index;
+                                try {
+                                    sel_index = std::stoul(sel);
+                                } catch (const std::invalid_argument & e) {
+                                    sel_index = target.size();
+                                }
+                                if (sel_index >= target.size()) {
+                                    _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                                    return;
+                                }
+                                target = target[sel_index];
+                            } else {
                                _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                                return;
                            }
-                            target = target[sel];
                        }
                        _refs[ref] = target;
                    }
@@ -925,17 +943,17 @@ public:
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
            if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
            } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
            }
            if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
            } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
            }
            std::stringstream out;
            out << "(";
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -18,4 +18,6 @@ struct common_grammar_options {
    bool dotall = false;
 };

+std::string gbnf_format_literal(const std::string & literal);
+
 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -442,3 +442,9 @@ void common_log_set_prefix(struct common_log * log, bool prefix) {
 void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }
+
+void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
+    if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+        common_log_add(common_log_main(), level, "%s", text);
+    }
+}
--- a/common/log.h
+++ b/common/log.h
@@ -36,6 +36,8 @@ extern int common_log_verbosity_thold;

 void common_log_set_verbosity_thold(int verbosity); // not thread-safe

+void common_log_default_callback(enum ggml_log_level level, const char * text, void * user_data);
+
 // the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
 struct common_log;
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"

-#include <cmath>
-#include <unordered_map>
 #include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <unordered_map>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {

    llama_token_data_array cur_p;

+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
    void set_logits(struct llama_context * ctx, int idx) {
        const auto * logits = llama_get_logits_ith(ctx, idx);

@@ -128,6 +136,12 @@ struct common_sampler {

        cur_p = { cur.data(), cur.size(), -1, false };
    }
+
+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };

 std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }

 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
 }

 void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }

 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
    // TODO: measure grammar performance

+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx,  0, sizeof(data_ctx));
+
    if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s:    sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s:    samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
    }
+
    if (ctx) {
-        llama_perf_context_print(ctx);
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms /  t_total_ms;
+
+        LOG_INF("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %%      (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);
+
        llama_memory_breakdown_print(ctx);
    }
 }

 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers

 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
    auto * res = &gsmpl->cur_p;

    if (do_sort && !res->sorted) {
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -139,8 +139,10 @@ models = [
    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
-    {"name": "llada-moe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
+    {"name": "afmoe",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", },
+    {"name": "bailingmoe2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
    {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
+    {"name": "minimax-m2",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -435,7 +437,7 @@ for model in models:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
+    except (OSError, TypeError) as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
@@ -277,10 +277,15 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()


-def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
+    from huggingface_hub import try_to_load_from_cache
+
    # normally, adapter does not come with base model config, we need to load it from AutoConfig
    config = AutoConfig.from_pretrained(hf_model_id)
-    return config.to_dict()
+    cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
+    cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
+
+    return config.to_dict(), cache_dir


 if __name__ == '__main__':
@@ -325,13 +330,13 @@ if __name__ == '__main__':
    # load base model
    if base_model_id is not None:
        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
-        hparams = load_hparams_from_hf(base_model_id)
+        hparams, dir_base_model = load_hparams_from_hf(base_model_id)
    elif dir_base_model is None:
        if "base_model_name_or_path" in lparams:
            model_id = lparams["base_model_name_or_path"]
            logger.info(f"Loading base model from Hugging Face: {model_id}")
            try:
-                hparams = load_hparams_from_hf(model_id)
+                hparams, dir_base_model = load_hparams_from_hf(model_id)
            except OSError as e:
                logger.error(f"Failed to load base model config: {e}")
                logger.error("Please try downloading the base model and add its path to --base")
@@ -480,6 +485,7 @@ if __name__ == '__main__':
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
            hparams=hparams,
+            remote_hf_model_id=base_model_id,
        )

        logger.info("Exporting model...")
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -313,7 +313,12 @@ Converting the matmul weight format from ND to NZ to improve performance. Enable

 ### GGML_CANN_ACL_GRAPH

-Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
+Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default. This option is only effective if `USE_ACL_GRAPH` was enabled at compilation time. To enable it, recompile using:
+
+```sh
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release -DUSE_ACL_GRAPH=ON
+cmake --build build --config release
+```

 ### GGML_CANN_GRAPH_CACHE_CAPACITY

--- a/docs/backend/OPENCL.md
+++ b/docs/backend/OPENCL.md
@@ -39,18 +39,23 @@ The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adren
 | Adreno 830 (Snapdragon 8 Elite)      | Support |
 | Adreno X85 (Snapdragon X Elite)      | Support |

+> A6x GPUs with a recent driver and compiler are supported; they are usually found in IoT platforms.
+However, A6x GPUs in phones are likely not supported due to the outdated driver and compiler.
+
 ## DataType Supports

 | DataType               | Status                     |
 |:----------------------:|:--------------------------:|
 | Q4_0                   | Support                    |
 | Q6_K                   | Support, but not optimized |
+| Q8_0                   | Support                    |
+| MXFP4                  | Support                    |

 ## Model Preparation

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration.
+You can refer to the general [llama-quantize tool](/tools/quantize/README.md) for steps to convert a model in Hugging Face safetensor format to GGUF with quantization.

-Currently we support `Q4_0` quantization and have optimize for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize`. For example,
+Currently we support `Q4_0` quantization and have optimized for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize` (i.e., make all weights in `Q4_0`). For example,

 ```sh
 ./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
@@ -58,6 +63,17 @@ Currently we support `Q4_0` quantization and have optimize for it. To achieve be

 Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.

+### `MXFP4` MoE Models
+
+OpenAI gpt-oss models are MoE models in `MXFP4`. The quantized model will be in `MXFP4_MOE`, a mixture of `MXFP4` and `Q8_0`.
+For this quantization, there is no need to specify `--pure`.
+For gpt-oss-20b model, you can directly [download](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) the quantized GGUF file in `MXFP4_MOE` from Hugging Face.
+
+Although it is possible to quantize gpt-oss-20b model in pure `Q4_0` (all weights in `Q4_0`), it is not recommended since `MXFP4` has been optimized for MoE while `Q4_0` is not. In addition, accuracy is expected to degrade with such pure `Q4_0` quantization.
+Hence, using the default `MXFP4_MOE` quantization (see the link above) is recommended for this model.
+
+> Note that the `Q4_0` model found [here](https://huggingface.co/unsloth/gpt-oss-20b-GGUF/blob/main/gpt-oss-20b-Q4_0.gguf) is a mixture of `Q4_0`, `Q8_0` and `MXFP4` and gives better performance than `MXFP4_MOE` quantization.
+
 ## CMake Options

 The OpenCL backend has the following CMake options that control the behavior of the backend.
@@ -146,10 +162,13 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
 * Ninja
 * Visual Studio 2022
 * Powershell 7
+* Python

 Visual Studio provides necessary headers and libraries although it is not directly used for building.
 Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.

+> Note that building using Visual Studio's cl compiler is not supported. Clang must be used. Clang depends on libraries provided by Visual Studio to work. Therefore, Visual Studio must be installed. Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
+
 Powershell 7 is used for the following commands.
 If an older version of Powershell is used, these commands may not work as they are.

@@ -201,9 +220,12 @@ ninja

 ## Known Issues

- Currently OpenCL backend does not work on Adreno 6xx GPUs.
+- Flash attention does not always improve performance.
+- Currently OpenCL backend works on A6xx GPUs with recent drivers and compilers (usually found in IoT platforms).
+  However, it does not work on A6xx GPUs found in phones with old drivers and compilers.

 ## TODO

 - Optimization for Q6_K
 - Support and optimization for Q4_K
+- Improve flash attention
--- a/docs/backend/hexagon/CMakeUserPresets.json
+++ b/docs/backend/hexagon/CMakeUserPresets.json
@@ -0,0 +1,49 @@
+{
+  "version": 4,
+  "configurePresets": [
+    {
+        "name": "arm64-android-snapdragon",
+        "hidden": true,
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "cacheVariables": {
+            "ANDROID_ABI":      "arm64-v8a",
+            "ANDROID_PLATFORM": "android-31",
+            "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
+            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
+            "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
+            "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
+            "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+            "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+            "PREBUILT_LIB_DIR": "android_aarch64",
+            "GGML_OPENMP":      "OFF",
+            "GGML_LLAMAFILE":   "OFF",
+            "GGML_OPENCL":      "ON",
+            "GGML_HEXAGON":     "ON",
+            "LLAMA_CURL":       "OFF"
+        }
+    },
+
+    {
+        "name": "arm64-windows-snapdragon",
+        "inherits": [ "base", "arm64-windows-llvm" ],
+        "cacheVariables": {
+            "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+            "PREBUILT_LIB_DIR": "windows_aarch64",
+            "GGML_OPENMP":      "OFF",
+            "GGML_LLAMAFILE":   "OFF",
+            "GGML_OPENCL":      "ON",
+            "GGML_HEXAGON":     "ON",
+            "LLAMA_CURL":       "OFF"
+        }
+    },
+
+    { "name": "arm64-android-snapdragon-debug"  , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
+    { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
+
+    { "name": "arm64-windows-snapdragon-debug"  , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
+    { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
+  ]
+}
--- a/docs/backend/hexagon/README.md
+++ b/docs/backend/hexagon/README.md
@@ -0,0 +1,239 @@
+# Snapdragon-based Android devices
+
+## How to Build
+
+The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
+This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
+
+This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
+
+```
+~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
+[d]/> cd /workspace
+```
+
+The rest of the Android build process assumes that you're running inside the toolchain container.
+Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
+
+```
+[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .
+
+[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
+Preset CMake variables:
+  ANDROID_ABI="arm64-v8a"
+  ...
+  CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake"
+  GGML_HEXAGON="ON"
+  GGML_OPENCL="ON"
+  GGML_OPENMP="OFF"
+  HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
+...
+-- Including OpenCL backend
+-- Including Hexagon backend
+...
+-- Build files have been written to: /workspace/build-snapdragon
+
+[d]/workspace> cmake --build build-snapdragon
+...
+[144/356] Performing build step for 'htp-v73'
+[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h
+[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj
+[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj
+[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj
+...
+-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so
+-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so
+...
+```
+
+To generate an installable "package" simply use cmake --install:
+
+```
+[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
+-- Install configuration: "Release"
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
+...
+-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
+-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
+...
+```
+
+## How to Install
+
+For this step, your device needs to be configured for on-device development.
+Please see https://developer.android.com/studio/debug/dev-options for details.
+
+Once ADB is enabled, use `adb push` to install the `pkg-adb/llama.cpp` package on the device.
+**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
+
+```
+~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
+pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
+pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
+pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
+102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
+```
+
+At this point, you should also install some models:
+
+```
+~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf
+...
+2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920]
+
+~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf
+Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
+```
+
+## How to Run
+
+The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables.
+
+llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4).
+You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option.
+
+Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options.
+
+Here are some examples of running various llama.cpp tools via ADB.
+
+Simple question for Llama-3.2-1B
+
+```
+~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v79
+ggml-hex: allocating new session: HTP0
+ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50
+...
+load_tensors: offloading output layer to GPU
+load_tensors: offloaded 17/17 layers to GPU
+load_tensors:          CPU model buffer size =   225.49 MiB
+load_tensors:         HTP0 model buffer size =     0.26 MiB
+load_tensors:  HTP0-REPACK model buffer size =   504.00 MiB
+...
+I hope this helps you understand the world's most popular cookies! [end of text]
+...
+llama_perf_sampler_print:    sampling time =      30.08 ms /   487 runs   (    0.06 ms per token, 16191.77 tokens per second)
+llama_perf_context_print:        load time =     617.94 ms
+llama_perf_context_print: prompt eval time =      80.76 ms /    11 tokens (    7.34 ms per token,   136.21 tokens per second)
+llama_perf_context_print:        eval time =    9210.59 ms /   475 runs   (   19.39 ms per token,    51.57 tokens per second)
+llama_perf_context_print:       total time =    9454.92 ms /   486 tokens
+llama_perf_context_print:    graphs reused =        473
+llama_memory_breakdown_print: | memory breakdown [MiB] | total   free    self   model   context   compute    unaccounted |
+llama_memory_breakdown_print: |   - HTP0 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - Host               |                  439 =   225 +     136 +      77                |
+llama_memory_breakdown_print: |   - HTP0-REPACK        |                  504 =   504 +       0 +       0                |
+```
+
+Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices.
+
+```
+~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v81
+ggml-hex: allocating new session: HTP0
+ggml-hex: allocating new session: HTP1
+...
+load_tensors: offloading output layer to GPU
+load_tensors: offloaded 17/17 layers to GPU
+load_tensors:          CPU model buffer size =   143.86 MiB
+load_tensors:         HTP1 model buffer size =     0.23 MiB
+load_tensors:  HTP1-REPACK model buffer size =  1575.00 MiB
+load_tensors:         HTP0 model buffer size =     0.28 MiB
+load_tensors:  HTP0-REPACK model buffer size =  2025.00 MiB
+...
+llama_context:        CPU  output buffer size =     0.19 MiB
+llama_kv_cache:       HTP1 KV buffer size =   238.00 MiB
+llama_kv_cache:       HTP0 KV buffer size =   306.00 MiB
+llama_kv_cache: size =  544.00 MiB (  8192 cells,  16 layers,  1/1 seqs), K (q8_0):  272.00 MiB, V (q8_0):  272.00 MiB
+llama_context:       HTP0 compute buffer size =    15.00 MiB
+llama_context:       HTP1 compute buffer size =    15.00 MiB
+llama_context:        CPU compute buffer size =    24.56 MiB
+...
+llama_perf_context_print: prompt eval time =    1730.57 ms /   212 tokens (    8.16 ms per token,   122.50 tokens per second)
+llama_perf_context_print:        eval time =    5624.75 ms /   257 runs   (   21.89 ms per token,    45.69 tokens per second)
+llama_perf_context_print:       total time =    7377.33 ms /   469 tokens
+llama_perf_context_print:    graphs reused =        255
+llama_memory_breakdown_print: | memory breakdown [MiB] | total   free    self   model   context   compute    unaccounted |
+llama_memory_breakdown_print: |   - HTP0 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP1 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - Host               |                  742 =   144 +     544 +      54                |
+llama_memory_breakdown_print: |   - HTP1-REPACK        |                 1575 =  1575 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP0-REPACK        |                 2025 =  2025 +       0 +       0                |
+```
+
+Op test for MUL_MAT
+
+```
+~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT
+...
+Backend 2/3: HTP0
+Device description: Hexagon
+Device memory: 2048 MB (2048 MB free)
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+
+~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v79
+ggml-hex: allocating new session: HTP0
+ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090
+| model          |       size | params | backend    | ngl | threads | n_batch | mmap |  test |           t/s |
+| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: |
+| llama 1B Q4_0  | 729.75 MiB | 1.24 B | HTP        |  99 |       4 |     128 |    0 | pp128 | 169.42 ± 1.75 |
+| llama 1B Q4_0  | 729.75 MiB | 1.24 B | HTP        |  99 |       4 |     128 |    0 |  tg64 |  51.54 ± 1.13 |
+
+build: 6a8cf8914 (6733)
+```
+
+## Environment variables
+
+- `GGML_HEXAGON_NDEV=1`
+  Controls the number of devices/sessions to allocate. The default is 1.
+  Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four.
+
+- `GGML_HEXAGON_NHVX=0`
+  Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version).
+
+- `GGML_HEXAGON_HOSTBUF=1`
+  Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
+  This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).
+
+- `GGML_HEXAGON_VERBOSE=1`
+  Enables verbose logging of Ops from the backend. Example output:
+
+  ```
+  ggml-hex: HTP0 graph-compute n_nodes 2
+  ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1
+  ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3
+  ggml-hex: HTP0 graph-compute n_nodes 1
+  ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0
+  ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024
+  ```
+
+- `GGML_HEXAGON_PROFILE=1`
+  Generates a host-side profile for the ggml-hexagon Ops.
+
+- `GGML_HEXAGON_OPMASK=0x0`
+  Allows enabling specific stages of the processing pipeline:
+
+  - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
+  - `0x2` Enable Dynamic Quantizer (if needed for the Op)
+  - `0x4` Enable Op Compute (MUL_MAT, etc.)
+
+  Examples:
+
+      `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
+      `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest
+      `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default)
--- a/docs/backend/hexagon/developer.md
+++ b/docs/backend/hexagon/developer.md
@@ -0,0 +1,109 @@
+# Hexagon backend developer details
+
+## Backend libraries
+
+The Hexagon backend consists of two parts:
+
+  - `libggml-hexagon`
+    This is the regular CPU-side GGML backend library, either shared or statically linked
+
+  - `libggml-htp-vNN`
+    This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels.
+    The correct library is selected automatically at runtime based on the HW version.
+
+Here is an example of the build artifacts:
+
+```
+~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml*
+pkg-adb/llama.cpp/lib/libggml-base.so
+pkg-adb/llama.cpp/lib/libggml-cpu.so
+pkg-adb/llama.cpp/lib/libggml-hexagon.so      <<< CPU library
+pkg-adb/llama.cpp/lib/libggml-htp-v73.so      <<< HTP op/kernels for Hexagon v73
+pkg-adb/llama.cpp/lib/libggml-htp-v75.so
+pkg-adb/llama.cpp/lib/libggml-htp-v79.so
+pkg-adb/llama.cpp/lib/libggml-htp-v81.so
+```
+
+## Memory buffers
+
+Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU.
+The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically
+quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA).
+
+Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those
+to the NPU at this point.
+
+The backend allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4.
+From the MMU perspective these buffers are still regular buffers (normal access by the CPU); they are marked as non-host simply to force
+the repacking.
+
+## Large model handling
+
+Hexagon NPU session (aka Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB.
+In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc).
+
+In order to map models larger than 3.5GB we need to allocate multiple devices and split the model.
+For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support.
+Each Hexagon device behaves like a GPU from the offload and model splitting perspective.
+
+Here is an example of running the GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
+
+```
+M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
+...
+LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
+ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
+GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
+      -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
+...
+llama_model_loader: - type  f32:  289 tensors
+llama_model_loader: - type q4_0:   96 tensors
+llama_model_loader: - type q8_0:    2 tensors
+llama_model_loader: - type mxfp4:  72 tensors
+...
+load_tensors: offloaded 25/25 layers to GPU
+load_tensors:          CPU model buffer size =  1182.09 MiB
+load_tensors:         HTP1 model buffer size =     6.64 MiB
+load_tensors:  HTP1-REPACK model buffer size =  2505.94 MiB
+load_tensors:         HTP3 model buffer size =     5.55 MiB
+load_tensors:  HTP3-REPACK model buffer size =  2088.28 MiB
+load_tensors:         HTP0 model buffer size =     7.75 MiB
+load_tensors:  HTP0-REPACK model buffer size =  2923.59 MiB
+load_tensors:         HTP2 model buffer size =     6.64 MiB
+load_tensors:  HTP2-REPACK model buffer size =  2505.94 MiB
+...
+llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
+llama_context:        CPU  output buffer size =     0.77 MiB
+llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells
+llama_kv_cache:       HTP1 KV buffer size =    25.50 MiB
+llama_kv_cache:       HTP3 KV buffer size =    25.50 MiB
+llama_kv_cache:       HTP0 KV buffer size =    25.50 MiB
+llama_kv_cache:       HTP2 KV buffer size =    25.50 MiB
+llama_kv_cache: size =  102.00 MiB (  8192 cells,  12 layers,  1/1 seqs), K (q8_0):   51.00 MiB, V (q8_0):   51.00 MiB
+llama_kv_cache_iswa: creating     SWA KV cache, size = 256 cells
+llama_kv_cache:       HTP1 KV buffer size =     0.80 MiB
+llama_kv_cache:       HTP3 KV buffer size =     0.53 MiB
+llama_kv_cache:       HTP0 KV buffer size =     1.06 MiB
+llama_kv_cache:       HTP2 KV buffer size =     0.80 MiB
+llama_kv_cache: size =    3.19 MiB (   256 cells,  12 layers,  1/1 seqs), K (q8_0):    1.59 MiB, V (q8_0):    1.59 MiB
+llama_context:       HTP0 compute buffer size =    16.06 MiB
+llama_context:       HTP1 compute buffer size =    16.06 MiB
+llama_context:       HTP2 compute buffer size =    16.06 MiB
+llama_context:       HTP3 compute buffer size =    16.06 MiB
+llama_context:        CPU compute buffer size =    98.19 MiB
+...
+llama_perf_context_print: prompt eval time =    3843.67 ms /   197 tokens ( 19.51 ms per token, 51.25 tokens per second)
+llama_perf_context_print:        eval time =    1686.13 ms /    31 runs   ( 54.39 ms per token, 18.39 tokens per second)
+llama_perf_context_print:       total time =    6266.30 ms /   228 tokens
+llama_perf_context_print:    graphs reused =         30
+llama_memory_breakdown_print: | memory breakdown [MiB] | total   free    self   model   context   compute    unaccounted |
+llama_memory_breakdown_print: |   - HTP0 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP1 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP2 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP3 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - Host               |                 1476 =  1208 +     105 +     162                |
+llama_memory_breakdown_print: |   - HTP1-REPACK        |                 2505 =  2505 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP3-REPACK        |                 2088 =  2088 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP0-REPACK        |                 2923 =  2923 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP2-REPACK        |                 2505 =  2505 +       0 +       0                |
+```
--- a/docs/build.md
+++ b/docs/build.md
@@ -178,6 +178,48 @@ GeForce RTX 3070      8.6
 cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
 ```

+### Overriding the CUDA Version
+
+If you have multiple CUDA installations on your system and want to compile llama.cpp for a specific one, e.g. for CUDA 11.7 installed under `/opt/cuda-11.7`:
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-11.7/bin/nvcc -DCMAKE_INSTALL_RPATH="/opt/cuda-11.7/lib64;\$ORIGIN" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+```
+
+#### Fixing Compatibility Issues with Old CUDA and New glibc
+
+If you try to use an old CUDA version (e.g. v11.7) with a new glibc version you can get errors like this:
+
+```
+/usr/include/bits/mathcalls.h(83): error: exception specification is
+  incompatible with that of previous function "cospi"
+
+
+  /opt/cuda-11.7/bin/../targets/x86_64-linux/include/crt/math_functions.h(5545):
+  here
+```
+
+It seems the least bad solution is to patch the CUDA installation to declare the correct signatures.
+Replace the following lines in `/path/to/your/cuda/installation/targets/x86_64-linux/include/crt/math_functions.h`:
+
+```C++
+// original lines
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cospi(double x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cospif(float x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 sinpi(double x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinpif(float x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rsqrt(double x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rsqrtf(float x);
+
+// edited lines
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cospi(double x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cospif(float x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 sinpi(double x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinpif(float x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rsqrt(double x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rsqrtf(float x) noexcept (true);
+```
+
 ### Runtime CUDA environmental variables

 You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
@@ -261,10 +303,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build --config Release -- -j 16
  ```

+  Note: `GPU_TARGETS` is optional; omitting it will build the code for all GPUs in the current system.
+
  To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.

  The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
@@ -282,17 +326,17 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build -- -j 16
  ```

 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
  ```bash
  set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake -S . -B build -G Ninja -DGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
  cmake --build build
  ```
-  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
+  If necessary, adapt `GPU_TARGETS` to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
  Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.


--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there the following images, similar to the above:

--- a/docs/ops.md
+++ b/docs/ops.md
@@ -14,99 +14,108 @@ Legend:

 | Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
 |-----------|------|------|------|------|------|------|------|------|------|
-|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
+|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                          CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
 |                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ❌ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ |
+|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
-|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                     OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
-|                              PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                              PAD | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
+|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
-|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                             ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                              SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                              SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ❌ | ❌ |
 |                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
-|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
+|                              SUM | ❌ | ✅ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
 |                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 🟡 | ❌ |
+|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                         TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
-|                            XIELU | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
--- a/docs/ops/CUDA.csv
+++ b/docs/ops/CUDA.csv
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/docs/ops/Vulkan.csv
+++ b/docs/ops/Vulkan.csv
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified

 ...

--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -6,8 +6,54 @@ More Info:
 - https://github.com/ggml-org/llama.cpp/pull/14644
 - https://github.com/ggml-org/llama.cpp/pull/14771

+## Parameters
+The diffusion CLI supports various parameters to control the generation process:

-Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
+### Core Diffusion Parameters
+- `--diffusion-steps`: Number of diffusion steps (default: 256)
+- `--diffusion-algorithm`: Algorithm for token selection
+  - `0`: ORIGIN - Tokens are generated in a purely random order (see https://arxiv.org/abs/2107.03006).
+  - `1`: ENTROPY_BASED - Entropy-based selection
+  - `2`: MARGIN_BASED - Margin-based selection
+  - `3`: RANDOM - Random selection
+  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
+  - More documentation here https://github.com/DreamLM/Dream
+- `--diffusion-visual`: Enable live visualization during generation

-Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
+### Scheduling Parameters
+Choose one of the following scheduling methods:

+**Timestep-based scheduling:**
+- `--diffusion-eps`: Epsilon value for timestep scheduling (e.g., 0.001)
+
+**Block-based scheduling:**
+- `--diffusion-block-length`: Block size for block-based scheduling (e.g., 32)
+
+### Sampling Parameters
+- `--temp`: Temperature for sampling (0.0 = greedy/deterministic, higher = more random)
+- `--top-k`: Top-k filtering for sampling
+- `--top-p`: Top-p (nucleus) filtering for sampling
+- `--seed`: Random seed for reproducibility
+
+### Model Parameters
+- `-m`: Path to the GGUF model file
+- `-p`: Input prompt text
+- `-ub`: Maximum sequence length (ubatch size)
+- `-c`: Context size
+- `-b`: Batch size
+
+### Examples
+#### Dream architecture:
+```
+llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
+```
+
+#### LLaDA architecture:
+```
+llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
+```
+
+#### RND1 architecture:
+```
+llama-diffusion-cli -m RND1-Base-0910.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-algorithm 1 --diffusion-steps 256 --diffusion-visual --temp 0.5 --diffusion-eps 0.001
+```
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -38,6 +38,7 @@ The above command will output space-separated float values.
 |            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
 | 'json'     | openai style                 |
 | 'json+'    | add cosine similarity matrix |
+| 'raw'      | plain text output            |

 ### --embd-separator $"string"$
 | $"string"$   | |
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -70,6 +70,29 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    }
 }

+// plain, pipe-friendly output: one embedding per line, values space-separated ("%1.0f" when embd_normalize == 0, otherwise "%1.7f"); with RANK pooling only the first min(n_embd, n_cls_out) values of each row are printed
+static void print_raw_embeddings(const float * emb,
+                                 int n_embd_count,
+                                 int n_embd,
+                                 const llama_model * model,
+                                 enum llama_pooling_type pooling_type,
+                                 int embd_normalize) {
+    const uint32_t n_cls_out = llama_model_n_cls_out(model);
+    const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
+    const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd;
+
+    for (int j = 0; j < n_embd_count; ++j) {
+        for (int i = 0; i < cols; ++i) {
+            if (embd_normalize == 0) {
+                LOG("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
+            } else {
+                LOG("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
+            }
+        }
+        LOG("\n");
+    }
+}
+
 int main(int argc, char ** argv) {
    common_params params;

@@ -372,6 +395,8 @@ int main(int argc, char ** argv) {
        }

        if (notArray) LOG("\n}\n");
+    } else if (params.embd_out == "raw") {
+        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
    }

    LOG("\n");
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -4,10 +4,10 @@
 #include "llama.h"
 #include "ggml.h"

+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
-#include <numeric>

 /**
 * This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    return u.f;
 }

-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
    float v;
    if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
    } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
+        v = *(const float *) &data[i];
    } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
+        v = (float) *(const int64_t *) &data[i];
    } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
+        v = (float) *(const int32_t *) &data[i];
    } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
+        v = (float) *(const int16_t *) &data[i];
    } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
+        v = (float) *(const int8_t *) &data[i];
    } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
    } else {
        GGML_ABORT("fatal error");
    }
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -184,8 +184,13 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
            const char * name   = gguf_get_tensor_name  (ctx, i);
            const size_t size   = gguf_get_tensor_size  (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
+            const auto   type   = gguf_get_tensor_type  (ctx, i);

-            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
+            const char * type_name  = ggml_type_name(type);
+            const size_t type_size  = ggml_type_size(type);
+            const size_t n_elements = size / type_size;
+
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu, type = %s, n_elts = %zu\n", __func__, i, name, size, offset, type_name, n_elements);
        }
    }

--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -371,8 +371,17 @@ class SchemaConverter:
                        raise ValueError(f'Unsupported ref {ref}')

                    for sel in ref.split('#')[-1].split('/')[1:]:
-                        assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
-                        target = target[sel]
+                        assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}'
+                        if isinstance(target, list):
+                            try:
+                                sel_index = int(sel)
+                            except ValueError:
+                                raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}')
+                            assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}'
+                            target = target[sel_index]
+                        else:
+                            assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
+                            target = target[sel]

                    self._refs[ref] = target
                else:
@@ -547,7 +556,8 @@ class SchemaConverter:


    def _resolve_ref(self, ref):
-        ref_name = ref.split('/')[-1]
+        ref_fragment = ref.split('#')[-1]
+        ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment)
        if ref_name not in self._rules and ref not in self._refs_being_resolved:
            self._refs_being_resolved.add(ref)
            resolved = self._refs[ref]
--- a/examples/model-conversion/scripts/causal/run-org-model.py
+++ b/examples/model-conversion/scripts/causal/run-org-model.py
@@ -138,7 +138,10 @@ if model_path is None:
        "Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
    )

-config = AutoConfig.from_pretrained(model_path)
+
+print("Loading model and tokenizer using AutoTokenizer:", model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

 print("Model type:       ", config.model_type)
 print("Vocab size:       ", config.vocab_size)
@@ -147,10 +150,6 @@ print("Number of layers: ", config.num_hidden_layers)
 print("BOS token id:     ", config.bos_token_id)
 print("EOS token id:     ", config.eos_token_id)

-print("Loading model and tokenizer using AutoTokenizer:", model_path)
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-config = AutoConfig.from_pretrained(model_path)
-
 if unreleased_model_name:
    model_name_lower = unreleased_model_name.lower()
    unreleased_module_path = (
@@ -171,7 +170,7 @@ if unreleased_model_name:
        exit(1)
 else:
    model = AutoModelForCausalLM.from_pretrained(
-        model_path, device_map="auto", offload_folder="offload"
+        model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
    )

 for name, module in model.named_modules():
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -25,16 +25,17 @@ if(GIT_EXE)
    )
 endif()

-# Build the version string with optional dirty flag
 set(GGML_VERSION "${GGML_VERSION_BASE}")
-if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
-    set(GGML_VERSION "${GGML_VERSION}-dirty")
-endif()

 if(NOT GGML_BUILD_COMMIT)
    set(GGML_BUILD_COMMIT "unknown")
 endif()

+# Build the commit string with optional dirty flag
+if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
+    set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
+endif()
+
 include(CheckIncludeFileCXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -168,7 +169,7 @@ option(GGML_RV_ZFH           "ggml: enable riscv zfh"        ON)
 option(GGML_RV_ZVFH          "ggml: enable riscv zvfh"       ON)
 option(GGML_RV_ZICBOP        "ggml: enable riscv zicbop"     ON)
 option(GGML_XTHEADVECTOR     "ggml: enable xtheadvector"     OFF)
-option(GGML_VXE              "ggml: enable vxe"              ON)
+option(GGML_VXE              "ggml: enable vxe"              ${GGML_NATIVE})

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -251,6 +252,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adr
 set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                            "gmml: OpenCL API version to target")

+option(GGML_HEXAGON                         "ggml: enable Hexagon backend"                    OFF)
+
 # toolchain for vulkan-shaders-gen
 set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

--- a/ggml/include/ggml-hexagon.h
+++ b/ggml/include/ggml-hexagon.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);

 GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
-                                                    size_t n_threads, size_t n_devices,
-                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
+                                                    size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -242,6 +242,7 @@
 #define GGML_ROPE_TYPE_NEOX   2
 #define GGML_ROPE_TYPE_MROPE  8
 #define GGML_ROPE_TYPE_VISION 24
+#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000

 #define GGML_MROPE_SECTIONS   4

@@ -474,6 +475,7 @@ extern "C" {
        GGML_OP_COS,
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
+        GGML_OP_CUMSUM,
        GGML_OP_MEAN,
        GGML_OP_ARGMAX,
        GGML_OP_COUNT_EQUAL,
@@ -528,7 +530,10 @@ extern "C" {
        GGML_OP_ARANGE,
        GGML_OP_TIMESTEP_EMBEDDING,
        GGML_OP_ARGSORT,
+        GGML_OP_TOP_K,
        GGML_OP_LEAKY_RELU,
+        GGML_OP_TRI,
+        GGML_OP_FILL,

        GGML_OP_FLASH_ATTN_EXT,
        GGML_OP_FLASH_ATTN_BACK,
@@ -541,6 +546,7 @@ extern "C" {
        GGML_OP_RWKV_WKV6,
        GGML_OP_GATED_LINEAR_ATTN,
        GGML_OP_RWKV_WKV7,
+        GGML_OP_SOLVE_TRI,

        GGML_OP_UNARY,

@@ -575,8 +581,14 @@ extern "C" {
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
        GGML_UNARY_OP_EXP,
+        GGML_UNARY_OP_EXPM1,
+        GGML_UNARY_OP_SOFTPLUS,
        GGML_UNARY_OP_GELU_ERF,
        GGML_UNARY_OP_XIELU,
+        GGML_UNARY_OP_FLOOR,
+        GGML_UNARY_OP_CEIL,
+        GGML_UNARY_OP_ROUND,
+        GGML_UNARY_OP_TRUNC,

        GGML_UNARY_OP_COUNT,
    };
@@ -615,6 +627,13 @@ extern "C" {
        GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
    };

+    enum ggml_tri_type {
+        GGML_TRI_TYPE_UPPER_DIAG = 0,
+        GGML_TRI_TYPE_UPPER      = 1,
+        GGML_TRI_TYPE_LOWER_DIAG = 2,
+        GGML_TRI_TYPE_LOWER      = 3
+    };
+
    struct ggml_init_params {
        // memory pool
        size_t mem_size;   // bytes
@@ -952,6 +971,22 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_expm1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_expm1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_softplus(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_softplus_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    GGML_API struct ggml_tensor * ggml_sin(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@@ -978,6 +1013,10 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_cumsum(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
    // mean along rows
    GGML_API struct ggml_tensor * ggml_mean(
            struct ggml_context * ctx,
@@ -1151,6 +1190,46 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+
+
    // xIELU activation function
    // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
    // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
@@ -2063,6 +2142,7 @@ extern "C" {
    enum ggml_scale_mode {
        GGML_SCALE_MODE_NEAREST  = 0,
        GGML_SCALE_MODE_BILINEAR = 1,
+        GGML_SCALE_MODE_BICUBIC  = 2,

        GGML_SCALE_MODE_COUNT
    };
@@ -2141,6 +2221,23 @@ extern "C" {
            int                   shift2,
            int                   shift3);

+    // Convert matrix into a triangular one (upper, strict upper, lower or strict lower) by writing
+    // zeroes everywhere outside the masked area
+    GGML_API struct ggml_tensor * ggml_tri(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_tri_type    type);
+
+    // Fill tensor a with constant c
+    GGML_API struct ggml_tensor * ggml_fill(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 c);
+
+    GGML_API struct ggml_tensor * ggml_fill_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 c);

    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
    // timesteps: [N,]
@@ -2162,18 +2259,25 @@ extern "C" {
            struct ggml_tensor  * a,
            enum ggml_sort_order  order);

+    // similar to ggml_top_k but implemented as `argsort` + `view`
+    GGML_API struct ggml_tensor * ggml_argsort_top_k(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   k);
+
+    // top k elements per row
+    // note: the resulting top k indices are in no particular order
+    GGML_API struct ggml_tensor * ggml_top_k(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   k);
+
    GGML_API struct ggml_tensor * ggml_arange(
            struct ggml_context * ctx,
            float                 start,
            float                 stop,
            float                 step);

-    // top k elements per row
-    GGML_API struct ggml_tensor * ggml_top_k(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   k);
-
 #define GGML_KQ_MASK_PAD 64

    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
@@ -2310,6 +2414,27 @@ extern "C" {
            struct ggml_tensor  * b,
            struct ggml_tensor  * state);

+    /* Solves a specific equation of the form Ax=B, where A is a triangular matrix
+    *  without zeroes on the diagonal (i.e. invertible).
+    *  B can have any number of columns, but must have the same number of rows as A
+    *  If A is [n, n] and B is [n, m], then the result will be [n, m] as well
+    *  Has O(n^3) complexity (unlike most matrix ops out there), so use on cases
+    *  where n > 100 sparingly, pre-chunk if necessary.
+    *
+    *  If left = false, solves xA=B instead
+    *  If lower = false, assumes upper triangular instead
+    *  If uni = true, assumes diagonal of A to be all ones (will override actual values)
+    *
+    *  TODO: currently only lower, right, non-unitriangular variant is implemented
+    */
+    GGML_API struct ggml_tensor * ggml_solve_tri(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  left,
+        bool                  lower,
+        bool                  uni);
+
    // custom operators

    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -211,6 +211,11 @@ add_library(ggml-base
            ggml-quants.h
            gguf.cpp)

+set_target_properties(ggml-base PROPERTIES
+    VERSION ${GGML_VERSION}
+    SOVERSION ${GGML_VERSION_MAJOR}
+)
+
 target_include_directories(ggml-base PRIVATE .)
 if (GGML_BACKEND_DL)
    target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
@@ -220,6 +225,11 @@ add_library(ggml
            ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)

+set_target_properties(ggml PROPERTIES
+    VERSION ${GGML_VERSION}
+    SOVERSION ${GGML_VERSION_MAJOR}
+)
+
 if (GGML_BACKEND_DIR)
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
@@ -259,6 +269,12 @@ function(ggml_add_backend_library backend)
        target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
    endif()

+    # Set versioning properties for all backend libraries
+    set_target_properties(${backend} PROPERTIES
+        VERSION ${GGML_VERSION}
+        SOVERSION ${GGML_VERSION_MAJOR}
+    )
+
    if(NOT GGML_AVAILABLE_BACKENDS)
        set(GGML_AVAILABLE_BACKENDS "${backend}"
            CACHE INTERNAL "List of backends for cmake package")
@@ -304,6 +320,22 @@ function(ggml_add_cpu_backend_variant tag_name)
            set(GGML_INTERNAL_${feat} ON)
        endforeach()
    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat VXE2 NNPA)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        foreach (feat RVV)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
        foreach (feat ${ARGN})
            set(GGML_INTERNAL_${feat} ON)
        endforeach()
@@ -371,6 +403,20 @@ if (GGML_CPU_ALL_VARIANTS)
        else()
            message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(z15    Z15 VXE2)
+            ggml_add_cpu_backend_variant(z16    Z16 VXE2 NNPA)
+        else()
+            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(riscv64_0)
+            ggml_add_cpu_backend_variant(riscv64_v   RVV)
+        else()
+            message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
    else()
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
    endif()
@@ -390,6 +436,7 @@ ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
+ggml_add_backend(Hexagon)

 foreach (target ggml-base ggml)
    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -226,16 +226,23 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
    }

    if (best_fit_block == -1) {
-        // no suitable block found, try the last block (this will grow a chunks size)
+        // no suitable block found, try the last block of each chunk (this may grow a chunk's size)
+        int64_t best_reuse = INT64_MIN;
        for (int c = 0; c < alloc->n_chunks; ++c) {
            struct tallocr_chunk * chunk = alloc->chunks[c];
            if (chunk->n_free_blocks > 0) {
                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
                max_avail = MAX(max_avail, block->size);
-                if (block->size >= size) {
+                int64_t reuse_factor = chunk->max_size - block->offset - size;
+                // reuse_factor < 0 : amount of extra memory that needs to be allocated
+                // reuse_factor = 0 : allocated free space exactly matches tensor size
+                // reuse_factor > 0 : superfluous memory that will remain unused
+                bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
+                bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
+                if (block->size >= size && (better_reuse || better_fit)) {
                    best_fit_chunk = c;
                    best_fit_block = chunk->n_free_blocks - 1;
-                    break;
+                    best_reuse = reuse_factor;
                }
            }
        }
@@ -268,7 +275,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 #ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, addr, tensor);
    size_t cur_max = addr.offset + size;
-    if (cur_max > alloc->max_size[addr.chunk]) {
+    if (cur_max > chunk->max_size) {
        // sort allocated_tensors by chunk/offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {
@@ -598,6 +605,26 @@ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor
    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 }

+// free the extra space at the end if the new tensor is smaller
+static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+    struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+
+    size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
+    size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+
+    GGML_ASSERT(parent_size >= node_size);
+
+    if (parent_size > node_size) {
+        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+        struct buffer_address p_addr = p_hn->addr;
+        p_addr.offset += node_size;
+        size_t extra_size = parent_size - node_size;
+        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
+        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+    }
+}
+
 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0);
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
@@ -643,6 +670,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                            hn->addr = p_hn->addr;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
+                            ggml_gallocr_free_extra_space(galloc, node, view_src);
                            return;
                        }
                    } else {
@@ -650,6 +678,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                        hn->buffer_id = p_hn->buffer_id;
                        hn->addr = p_hn->addr;
                        p_hn->allocated = false; // avoid freeing the parent
+                        ggml_gallocr_free_extra_space(galloc, node, parent);
                        return;
                    }
                }
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -57,6 +57,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_HEXAGON
+#include "ggml-hexagon.h"
+#endif
+
 #ifdef GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
@@ -199,6 +203,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_OPENCL
        register_backend(ggml_backend_opencl_reg());
 #endif
+#ifdef GGML_USE_HEXAGON
+        register_backend(ggml_backend_hexagon_reg());
+#endif
 #ifdef GGML_USE_CANN
        register_backend(ggml_backend_cann_reg());
 #endif
@@ -598,6 +605,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("sycl", silent, dir_path);
    ggml_backend_load_best("vulkan", silent, dir_path);
    ggml_backend_load_best("opencl", silent, dir_path);
+    ggml_backend_load_best("hexagon", silent, dir_path);
    ggml_backend_load_best("musa", silent, dir_path);
    ggml_backend_load_best("cpu", silent, dir_path);
    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1698,8 +1698,6 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
    GGML_ASSERT(sched);
    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

-    ggml_backend_sched_reset(sched);
-
    ggml_backend_sched_synchronize(sched);

    ggml_backend_sched_split_graph(sched, measure_graph);
--- a/ggml/src/ggml-cann/Doxyfile
+++ b/ggml/src/ggml-cann/Doxyfile
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -48,31 +48,33 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
        default:
            return ACL_DT_UNDEFINED;
    }
-    return ACL_DT_UNDEFINED;
 }

-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
-                                   size_t* nb, int64_t dims, aclFormat format,
-                                   size_t offset) {
+acl_tensor_ptr ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                       int64_t *           ne,
+                                       size_t *            nb,
+                                       int64_t             dims,
+                                       aclFormat           format,
+                                       size_t              offset) {
    // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be
    // added.
    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];

    if (ne == nullptr) {
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
-            acl_ne[i] = tensor->ne[i];
+            acl_ne[i]     = tensor->ne[i];
            // The step size of acl is in elements.
            acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
        }
    } else {
        // With bcast
        for (int i = 0; i < dims; i++) {
-            acl_ne[i] = ne[i];
+            acl_ne[i]     = ne[i];
            acl_stride[i] = nb[i] / ggml_element_size(tensor);
        }
    }

-    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t final_dims      = (dims == 0 ? GGML_MAX_DIMS : dims);
    int64_t acl_storage_len = 1;
    for (int i = 0; i < final_dims; i++) {
        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
@@ -84,15 +86,23 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    std::reverse(acl_ne, acl_ne + final_dims);
    std::reverse(acl_stride, acl_stride + final_dims);

-    aclTensor* acl_tensor = aclCreateTensor(
-        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
-        elem_offset, format, &acl_storage_len, 1,
-        tensor->data);
+    aclTensor * raw = aclCreateTensor(acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride, elem_offset,
+                                      format, &acl_storage_len, 1, tensor->data);

-    return acl_tensor;
+    return acl_tensor_ptr(raw);
 }

-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
+acl_int_array_ptr ggml_cann_create_int_array(const int64_t * value, uint64_t size) {
+    aclIntArray * raw = aclCreateIntArray(value, size);
+    return acl_int_array_ptr(raw);
+}
+
+acl_scalar_ptr ggml_cann_create_scalar(void * value, aclDataType dataType) {
+    aclScalar * raw = aclCreateScalar(value, dataType);
+    return acl_scalar_ptr(raw);
+}
+
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1) {
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
            return true;
@@ -101,15 +111,16 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
    return false;
 }

-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
-                                  const ggml_tensor* src1,
-                                  int64_t* bcast_src0_ne,
-                                  int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
-                                  size_t* bcast_src1_nb) {
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_src0_ne,
+                                  int64_t *           bcast_src1_ne,
+                                  size_t *            bcast_src0_nb,
+                                  size_t *            bcast_src1_nb) {
    GGML_ASSERT(ggml_can_repeat(src1, src0));
    int bcast_dim_cnt = 0;
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        int64_t nr = src0->ne[i] / src1->ne[i];
+        int64_t nr                   = src0->ne[i] / src1->ne[i];
        bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
        bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
        bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
@@ -119,21 +130,26 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
            // Need to add an extra dim.
            bcast_src0_ne[bcast_dim_cnt] = nr;
            bcast_src1_ne[bcast_dim_cnt] = 1;
-            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
-                                           bcast_src0_ne[bcast_dim_cnt - 1];
-            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
-                                           bcast_src1_ne[bcast_dim_cnt - 1];
+            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] * bcast_src0_ne[bcast_dim_cnt - 1];
+            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] * bcast_src1_ne[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
        }
    }
    return bcast_dim_cnt;
 }

-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb) {
    // input and dst shoule in same shape, except first two dims.
    GGML_ASSERT(input_ne[2] == dst_ne[2]);
    GGML_ASSERT(input_ne[3] == dst_ne[3]);
@@ -148,34 +164,30 @@ int64_t ggml_cann_get_mulmat_bcast_shape(
        // Do not use bcast in the first two dimensions because we only support
        // the bcast batch dimension. Just copy them.
        if (i < 2 || nr == 1) {
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i];
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i];
            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i];

-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
            bcast_dim_cnt++;
        } else {
            // Need to add an extra dim.
-            bcast_input_ne[bcast_dim_cnt] = nr;
-            bcast_dst_ne[bcast_dim_cnt] = nr;
+            bcast_input_ne[bcast_dim_cnt]  = nr;
+            bcast_dst_ne[bcast_dim_cnt]    = nr;
            bcast_weight_ne[bcast_dim_cnt] = 1;
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
            bcast_dim_cnt++;

-            bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i] / nr;
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i] / nr;
            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
-                                            bcast_input_ne[bcast_dim_cnt - 1];
-            bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
-                                          bcast_dst_ne[bcast_dim_cnt - 1];
-            bcast_weight_nb[bcast_dim_cnt] =
-                bcast_weight_nb[bcast_dim_cnt - 1] *
-                bcast_weight_ne[bcast_dim_cnt - 1];
+            bcast_input_nb[bcast_dim_cnt]  = bcast_input_nb[bcast_dim_cnt - 1] * bcast_input_ne[bcast_dim_cnt - 1];
+            bcast_dst_nb[bcast_dim_cnt]    = bcast_dst_nb[bcast_dim_cnt - 1] * bcast_dst_ne[bcast_dim_cnt - 1];
+            bcast_weight_nb[bcast_dim_cnt] = bcast_weight_nb[bcast_dim_cnt - 1] * bcast_weight_ne[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
        }
    }
--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -23,11 +23,12 @@
 #ifndef CANN_ACL_TENSOR_H
 #define CANN_ACL_TENSOR_H

-#include <algorithm>
-#include <cstring>
+#include "common.h"

 #include <aclnn/aclnn_base.h>
-#include "common.h"
+
+#include <algorithm>
+#include <cstring>

 /**
 * @brief	Maps a ggml_type to its corresponding aclDataType.
@@ -43,6 +44,20 @@
 */
 aclDataType ggml_cann_type_mapping(ggml_type type);

+// Deleter for ACL objects: calls DestroyFunc on non-null pointers and checks the result with ACL_CHECK.
+template <typename T, aclError (*DestroyFunc)(const T *)> struct acl_deleter {
+    void operator()(T * ptr) const noexcept {
+        if (ptr) {
+            ACL_CHECK(DestroyFunc(ptr));
+        }
+    }
+};
+
+using acl_tensor_ptr      = std::unique_ptr<aclTensor, acl_deleter<aclTensor, aclDestroyTensor>>;
+using acl_int_array_ptr   = std::unique_ptr<aclIntArray, acl_deleter<aclIntArray, aclDestroyIntArray>>;
+using acl_scalar_ptr      = std::unique_ptr<aclScalar, acl_deleter<aclScalar, aclDestroyScalar>>;
+using acl_tensor_list_ptr = std::unique_ptr<aclTensorList, acl_deleter<aclTensorList, aclDestroyTensorList>>;
+
 /**
 * @brief   Creates an ACL tensor from a ggml_tensor with optional shape.
 *
@@ -62,10 +77,12 @@ aclDataType ggml_cann_type_mapping(ggml_type type);
 * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
 * @return  Pointer to the created ACL tensor.
 */
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
-                             size_t* nb = nullptr, int64_t dims = 0,
-                             aclFormat format = ACL_FORMAT_ND,
-                             size_t offset = 0);
+acl_tensor_ptr ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                       int64_t *           ne     = nullptr,
+                                       size_t *            nb     = nullptr,
+                                       int64_t             dims   = 0,
+                                       aclFormat           format = ACL_FORMAT_ND,
+                                       size_t              offset = 0);

 /**
 * @brief   Template for creating an ACL tensor from provided parameters. typename TYPE
@@ -87,12 +104,15 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
 * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
 * @return  Pointer to the created ACL tensor.
 */
-template<typename TYPE>
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   TYPE type_size, int64_t* ne, TYPE* nb,
-                                   int64_t dims,
-                                   aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0) {
+template <typename TYPE>
+acl_tensor_ptr ggml_cann_create_tensor(void *      data_ptr,
+                                       aclDataType dtype,
+                                       TYPE        type_size,
+                                       int64_t *   ne,
+                                       TYPE *      nb,
+                                       int64_t     dims,
+                                       aclFormat   format = ACL_FORMAT_ND,
+                                       size_t      offset = 0) {
    int64_t tmp_ne[GGML_MAX_DIMS * 2];
    int64_t tmp_stride[GGML_MAX_DIMS * 2];

@@ -109,11 +129,75 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
    std::reverse(tmp_ne, tmp_ne + dims);
    std::reverse(tmp_stride, tmp_stride + dims);

-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
+    aclTensor * raw =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, format, &acl_storage_len, 1, data_ptr);

-    return acl_tensor;
+    return acl_tensor_ptr(raw);
+}
+
+/**
+ * @brief Create an ACL int array resource wrapped in a smart pointer.
+ *
+ * This function constructs an aclIntArray from the provided int64_t values
+ * and returns it as an acl_int_array_ptr (a std::unique_ptr with a custom
+ * deleter). The returned pointer owns the ACL resource and will automatically
+ * destroy it via aclDestroyIntArray().
+ *
+ * @param value  Pointer to the int64_t elements.
+ * @param size   Number of elements in value.
+ *
+ * @return A smart pointer managing the created ACL int array.
+ */
+acl_int_array_ptr ggml_cann_create_int_array(const int64_t * value, uint64_t size);
+
+/**
+ * @brief Create an ACL scalar resource wrapped in a smart pointer.
+ *
+ * This function constructs an aclScalar from the raw value pointer and ACL
+ * data type, then returns it as an acl_scalar_ptr (a std::unique_ptr with
+ * a custom deleter). The returned pointer owns the ACL scalar and will
+ * automatically destroy it via aclDestroyScalar().
+ *
+ * @param value     Pointer to the raw scalar memory.
+ * @param dataType  ACL data type of the scalar.
+ *
+ * @return A smart pointer managing the created ACL scalar.
+ */
+acl_scalar_ptr ggml_cann_create_scalar(void * value, aclDataType dataType);
+
+/**
+ * @brief Create an ACL tensor list from multiple tensor smart pointers.
+ *
+ * This function accepts a variadic list of acl_tensor_ptr (a unique_ptr with
+ * custom deleter) and produces an aclTensorList using aclCreateTensorList().
+ *
+ * The lifecycle management of the tensor objects changes as follows:
+ *  - aclCreateTensorList() takes ownership of the tensors
+ *  - Each input smart pointer releases ownership using release()
+ *  - As a result, the tensors will NOT be destroyed by unique_ptr
+ *  - Instead, they will be destroyed when aclDestroyTensorList() is called
+ *
+ * This ensures correct ownership transfer and prevents double-free situations.
+ *
+ * @tparam acl_tensor_ptr  Variadic template parameter; each argument must be
+ *                         a unique_ptr-like type supporting get() and release().
+ *
+ * @param tensors  Variadic list of acl_tensor_ptr objects. Ownership of
+ *                         each tensor is transferred away from these smart pointers.
+ *
+ * @return A smart pointer (acl_tensor_list_ptr) owning the created ACL tensor list.
+ *
+ * @note This implementation is C++11 compatible. The ownership-release process is
+ *       executed using a pack expansion inside an initializer list.
+ */
+template <typename... acl_tensor_ptr> acl_tensor_list_ptr ggml_cann_create_tensor_list(acl_tensor_ptr &&... tensors) {
+    aclTensor *     raw_tensors[] = { tensors.get()... };
+    aclTensorList * raw           = aclCreateTensorList(raw_tensors, sizeof...(tensors));
+    // The tensors are now destroyed by the aclTensorList, so release ownership
+    // without destroying them here.
+    int             dummy[]       = { (tensors.release(), 0)... };
+    GGML_UNUSED(dummy);
+    return acl_tensor_list_ptr(raw);
 }

 /**
@@ -132,7 +216,7 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
 *          to 1. If such a dimension is found, broadcasting is required to align t1
 *          with t0 for element-wise operations.
 */
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1);

 /**
 * @brief   Computes broadcast shapes and strides for two ggml_tensors.
@@ -187,19 +271,21 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
 *  dim1 in a inserted dim, should add nb for dim1,
 *  and all other nb moves to next in order.
 */
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
-                        int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_ne_src0,
+                                  int64_t *           bcast_ne_src1,
+                                  size_t *            bcast_nb_src0,
+                                  size_t *            bcast_nb_src1);

 // Bcast macro to avoid duplicate code.
-#define BCAST_SHAPE(src0, src1)                                              \
-    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                            \
-    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                            \
-    size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2];                             \
-    size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2];                             \
-    int64_t bcast_dims = ggml_cann_get_bcast_shape(                          \
-        src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
-        bcast_##src1##_nb);
+#define BCAST_SHAPE(src0, src1)                                                                      \
+    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src0##_nb[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src1##_nb[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_dims = ggml_cann_get_bcast_shape(src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, \
+                                                   bcast_##src0##_nb, bcast_##src1##_nb);

 #define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims

@@ -233,26 +319,31 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* sr
 *       before cast dim.
 * @sa ggml_cann_get_bcast_shape
 */
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb);

 // Bcast macro to avoid duplicate code.
-#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                         \
-    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                      \
-    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                     \
-    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                        \
-    size_t bcast_##input##_nb[GGML_MAX_DIMS * 2];                       \
-    size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2];                      \
-    size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2];                         \
-    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(              \
-        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
-        bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne,      \
-        bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
+#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                                                                  \
+    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                                                               \
+    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                                                              \
+    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                                                                 \
+    size_t  bcast_##input##_nb[GGML_MAX_DIMS * 2];                                                               \
+    size_t  bcast_##weight##_nb[GGML_MAX_DIMS * 2];                                                              \
+    size_t  bcast_##dst##_nb[GGML_MAX_DIMS * 2];                                                                 \
+    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(                                                       \
+        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, bcast_##input##_ne, bcast_##weight##_ne, \
+        bcast_##dst##_ne, bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);

-#define BCAST_MUL_MAT_PARAM(tensor) \
-    bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
+#define BCAST_MUL_MAT_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims

 #endif  // CANN_ACL_TENSOR_H
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -23,31 +23,35 @@
 #ifndef CANN_ACLNN_OPS
 #define CANN_ACLNN_OPS

-#include <unordered_set>
-#include <functional>
+#include "acl_tensor.h"
+#include "common.h"
+
 #include <aclnnop/aclnn_abs.h>
-#include <aclnnop/aclnn_neg.h>
-#include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_arange.h>
 #include <aclnnop/aclnn_argsort.h>
 #include <aclnnop/aclnn_cat.h>
 #include <aclnnop/aclnn_clamp.h>
+#include <aclnnop/aclnn_cos.h>
+#include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_gelu.h>
 #include <aclnnop/aclnn_gelu_v2.h>
-#include <aclnnop/aclnn_sigmoid.h>
 #include <aclnnop/aclnn_hardsigmoid.h>
 #include <aclnnop/aclnn_hardswish.h>
 #include <aclnnop/aclnn_leaky_relu.h>
-#include <aclnnop/aclnn_relu.h>
-#include <aclnnop/aclnn_silu.h>
-#include <aclnnop/aclnn_tanh.h>
-#include <aclnnop/aclnn_sqrt.h>
-#include <aclnnop/aclnn_sin.h>
-#include <aclnnop/aclnn_cos.h>
 #include <aclnnop/aclnn_log.h>
+#include <aclnnop/aclnn_logsoftmax.h>
+#include <aclnnop/aclnn_neg.h>
+#include <aclnnop/aclnn_norm.h>
+#include <aclnnop/aclnn_relu.h>
+#include <aclnnop/aclnn_sigmoid.h>
 #include <aclnnop/aclnn_sign.h>
-#include "acl_tensor.h"
-#include "common.h"
+#include <aclnnop/aclnn_silu.h>
+#include <aclnnop/aclnn_sin.h>
+#include <aclnnop/aclnn_sqrt.h>
+#include <aclnnop/aclnn_tanh.h>
+
+#include <functional>
+#include <unordered_set>

 /**
 * @brief   Repeats a ggml tensor along each dimension to match the dimensions
@@ -62,7 +66,7 @@
 * @param   dst The ggml tensor representing the destination, which op is
 *              GGML_OP_REPEAT and specifies the desired dimensions.
 */
-void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
@@ -82,7 +86,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result of the Leaky ReLU
 *            activation is stored, which op is `GGML_OP_LEAKY_RELU`
 */
-void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief    Concatenates multiple tensors along a specified dimension using the
@@ -97,7 +101,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @attention tensorList length should be 2 and the dimension using for concat
 *            default to 1.
 */
-void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Generates a sequence of evenly spaced values within a specified
@@ -113,7 +117,7 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *            `start`, 'stop' and 'step' are in dst->op_params and dst->op is
 *            `GGML_OP_ARANGE`.
 */
-void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies a clamp operation to the elements of a ggml tensor using the
@@ -131,7 +135,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the clamped values will be stored.
 *            dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
 */
-void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Scales the elements of a ggml tensor by a constant factor using the
@@ -148,7 +152,7 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the scaled values will be stored.
 *            dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
 */
-void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Sorts the elements of a ggml tensor and returns the indices that
@@ -163,7 +167,7 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the sorted indices will be stored.
 *            dst->op is `GGML_OP_ARGSORT`.
 */
-void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the Layer Normalization for a ggml tensor using the CANN
@@ -185,7 +189,67 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the normalized values will be stored.
 * @attention `Var` defaults to dst->ne[0].
 */
-void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief   Computes the L2 Normalization for a ggml tensor using the CANN
+ *          backend.
+ *
+ * @details This function applies the L2 Normalization operation on the
+ *          input tensor `src` and stores the result in the destination tensor
+ *          `dst`. L2 Normalization scales the input tensor such that the
+ *          L2 norm along the specified dimension equals 1. This operation
+ *          is commonly used in neural networks for feature normalization
+ *          and vector scaling.
+ *          The operation is defined as:
+ *          \f[
+ *              \text{out} = \frac{x}{\sqrt{\sum{x^2}}}
+ *          \f]
+ *          The normalization is performed along the last dimension by default.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ * @attention The normalization is performed along the last dimension of the
+ *            input tensor by default.
+ */
+void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief   Computes the Cross Entropy Loss for a ggml tensor using the CANN
+ *          backend.
+ *
+ * @details This function computes the cross entropy loss between the predicted
+ *          logits and target probability distributions. The operation follows
+ *          the same computation pattern as the CPU implementation:
+ *          1. Applies log_softmax to the logits along the class dimension
+ *          2. Element-wise multiplication with target distributions
+ *          3. Summation along the class dimension to get per-sample losses
+ *          4. Global summation and scaling by -1/nr to get final loss
+ *
+ *          The computation can be expressed as:
+ *          \f[
+ *              \text{loss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{ij} \cdot \log(\text{softmax}(x_{ij}))
+ *          \f]
+ *          where \f$N\f$ is the total number of samples, \f$C\f$ is the number
+ *          of classes, \f$x\f$ are the logits, and \f$y\f$ are the target
+ *          probability distributions.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the computed loss will be stored.
+ *            This should be a scalar tensor containing the final loss value.
+ *
+ * @note This implementation computes cross entropy between probability
+ *       distributions, not the typical classification cross entropy that
+ *       expects class indices as targets. Both input tensors (src0 and src1)
+ *       should have the same shape and represent probability distributions
+ *       over the class dimension.
+ * @note The function expects two source tensors:
+ *       - dst->src[0]: Logits tensor (before softmax)
+ *       - dst->src[1]: Target probability distributions tensor
+ * @note The computation is performed using CANN backend operators including
+ *       LogSoftmax, Mul, ReduceSum, and Muls for the final scaling.
+ */
+void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief  Computes the Group Normalization for a ggml tensor using the CANN
@@ -209,7 +273,7 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @attention eps defaults to 1e-6f.
 */
-void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the accumulation of tensors using the CANN backend.
@@ -228,7 +292,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the accumulated values will be stored.
 *            `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
 */
-void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the sum of elements along the last dimension of a ggml tensor
@@ -244,7 +308,7 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @attention `reduce_dims` defaults to 3, which means the last dimension.
 */
-void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the sum of elements in a ggml tensor.
@@ -258,7 +322,7 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 */

-void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
@@ -274,8 +338,7 @@ void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the upsampled values will be stored.
 *            dst->op is `GGML_OP_UPSCALE`.
 */
-void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
-                                  ggml_tensor* dst);
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Pads a ggml tensor to match the dimensions of the destination tensor
@@ -290,7 +353,7 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
 * @param dst The destination tensor, which specifies the target dimensions for
 *            padding. dst->op is `GGML_OP_PAD`.
 */
-void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Executes a 2D pooling operation on a ggml tensor using the CANN
@@ -307,7 +370,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor on which the pooling operation is to be
 *            performed. dst->op is `GGML_OP_POOL_2D`.
 */
-void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Duplicates a ggml tensor using the CANN backend.
@@ -326,7 +389,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *            different shape and dst is no-contiguous.
 * @note:     This func need to simplify.
 */
-void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the Root Mean Square (RMS) normalization of a ggml tensor
@@ -348,7 +411,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the normalized values will be stored.
 *            dst->op is `GGML_OP_RMS_NORM`.
 */
-void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies a diagonal mask to the tensor with a specified value.
@@ -363,7 +426,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *            `GGML_OP_DIAG_MASK`
 * @param value The value to use for masking.
 */
-void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
+void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value);

 /**
 * @brief   Performs an image-to-column transformation on the input tensor.
@@ -378,7 +441,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float
 * @param dst The destination tensor that stores the result of the operation.
 *            dst->op is `GGML_OP_IM2COL`.
 */
-void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes time step embeddings using sine and cosine functions.
@@ -392,10 +455,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result of the embedding operation
 *            will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
 */
-void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 // @see ggml_cann_dup.
-void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the softmax activation with optional masking.
@@ -417,7 +480,7 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored. dst->op is
 *            `GGML_OP_SOFTMAX`.
 */
-void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Extracts specific rows from a tensor based on indices.
@@ -429,7 +492,7 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the extracted rows will be stored.
 */
-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Writes specific rows into a tensor at positions specified by indices.
@@ -441,7 +504,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the specified rows will be updated.
 */
-void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Executes matrix multiplication for the given tensor.
@@ -454,7 +517,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor for storing the result of the matrix
 *            multiplication. dst->op is `GGML_OP_MUL_MAT`.
 */
-void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
@@ -477,7 +540,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @note The function currently does not support cases where the freq_scale is
 *       not equal 1.
 */
-void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the index of the maximum value along the specified dimension
@@ -492,7 +555,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the indices of the maximum values will
 *            be stored. dst->op is `GGML_OP_ARGMAX`.
 */
-void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief Adds two tensors element-wise and stores the result in a destination
@@ -509,8 +572,10 @@ void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param acl_src1 The second source tensor.
 * @param acl_dst The destination tensor where the result will be stored.
 */
-void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+void aclnn_add(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src0,
+               aclTensor *                 acl_src1,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Sub two tensors element-wise and stores the result in a destination
@@ -527,8 +592,10 @@ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
 * @param acl_src1 The second source tensor.
 * @param acl_dst The destination tensor where the result will be stored.
 */
-void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+void aclnn_sub(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src0,
+               aclTensor *                 acl_src1,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Performs element-wise multiplication of two tensors and stores the
@@ -546,8 +613,10 @@ void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
 * @param acl_other The second tensor for element-wise multiplication.
 * @param acl_dst The destination tensor where the result will be stored.
 */
-void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+void aclnn_mul(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src,
+               aclTensor *                 acl_other,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Matrix division, optionally in-place.
@@ -567,8 +636,10 @@ void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param inplace Flag indicating whether to perform the operation in-place on
 * `acl_src`.
 */
-void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+void aclnn_div(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src,
+               aclTensor *                 acl_other,
+               aclTensor *                 acl_dst = nullptr);

 /**
 * @brief Applies element-wise cosine function to the elements of a tensor.
@@ -584,8 +655,7 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param acl_dst The destination tensor where the cosine results will be
 * stored.
 */
-void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_dst);
+void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);

 /**
 * @brief Applies element-wise sine function to the elements of a tensor.
@@ -602,8 +672,7 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param acl_src The source tensor on which the sine function will be applied.
 * @param acl_dst The destination tensor where the sine results will be stored.
 */
-void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_dst);
+void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);

 /**
 * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
@@ -621,8 +690,12 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
 * @param acl_dst  Output pointer to the created ACL tensor corresponding to dst.
 */
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
-    aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
+void bcast_shape(ggml_tensor *    src0,
+                 ggml_tensor *    src1,
+                 ggml_tensor *    dst,
+                 acl_tensor_ptr & acl_src0,
+                 acl_tensor_ptr & acl_src1,
+                 acl_tensor_ptr & acl_dst);

 /**
 * @brief   Computes the 1D transposed convolution (deconvolution) of a ggml
@@ -637,7 +710,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
 * @param dst The destination tensor where the transposed convolution result
 * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
 */
-void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
@@ -662,7 +735,7 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
 * @param dst The destination tensor where the ELU-activated result will be stored.
 *            dst->op is expected to be `GGML_OP_ELU`.
 */
-void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Computes the mean of a ggml tensor element-wise using the CANN backend.
@@ -677,7 +750,7 @@ void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the mean result will be stored.
 *            dst->op is expected to be `GGML_OP_MEAN`.
 */
-void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies 1D reflect padding to a ggml tensor using the CANN backend.
@@ -692,7 +765,7 @@ void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the padded result will be stored.
 *            dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
 */
-void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Counts the number of equal elements in two ggml tensors using the CANN backend.
@@ -708,7 +781,7 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored.
 *            dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
 */
-void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Applies the Step activation function to a ggml tensor using the CANN backend.
@@ -723,7 +796,7 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored.
 *            dst->op is expected to be `GGML_OP_STEP`.
 */
-void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Performs the Flash Attention extended operator using the CANN backend.
@@ -738,59 +811,46 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the result will be stored.
 *            dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
 */
-void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /*
 * @brief A generic wrapper for ACL resources with custom deleter support.
 */
-using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
+using any_acl_resource = std::unique_ptr<void, std::function<void(void *)>>;

 /**
 * @brief Trait structure used to define how to destroy a given ACL resource type.
 *
 * @tparam T ACL resource type.
 */
-template<typename T>
-struct acl_resource_traits;
+template <typename T> struct acl_resource_traits;

 /**
 * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
 */
-template<>
-struct acl_resource_traits<aclTensor> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
-    }
+template <> struct acl_resource_traits<aclTensor> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyTensor(static_cast<aclTensor *>(p))); }
 };

 /**
 * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
 */
-template<>
-struct acl_resource_traits<aclIntArray> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
-    }
+template <> struct acl_resource_traits<aclIntArray> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray *>(p))); }
 };

 /**
 * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
 */
-template<>
-struct acl_resource_traits<aclScalar> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
-    }
+template <> struct acl_resource_traits<aclScalar> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyScalar(static_cast<aclScalar *>(p))); }
 };

 /**
 * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
 */
-template<>
-struct acl_resource_traits<aclTensorList> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
-    }
+template <> struct acl_resource_traits<aclTensorList> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList *>(p))); }
 };

 /**
@@ -800,14 +860,8 @@ struct acl_resource_traits<aclTensorList> {
 * @param ptr Raw pointer to ACL resource.
 * @return any_acl_resource Smart pointer that handles destruction.
 */
-template<typename T>
-any_acl_resource make_acl_resource(T* ptr) {
-    return any_acl_resource(
-        static_cast<void*>(ptr),
-        [](void* p) {
-            acl_resource_traits<T>::destroy(p);
-        }
-    );
+template <typename T> any_acl_resource make_acl_resource(T * ptr) {
+    return any_acl_resource(static_cast<void *>(ptr), [](void * p) { acl_resource_traits<T>::destroy(p); });
 }

 /**
@@ -817,89 +871,10 @@ any_acl_resource make_acl_resource(T* ptr) {
 * @param vec Target vector to hold ACL resources.
 * @param args Raw pointers to ACL resources.
 */
-template<typename... Args>
-void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
+template <typename... Args> void register_acl_resources(std::vector<any_acl_resource> & vec, Args *... args) {
    (vec.emplace_back(make_acl_resource(args)), ...);
 }

-/**
- * @brief Task class that wraps the execution of an aclnn function call.
- */
-class aclnn_task : public cann_task {
-    public:
-        aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
-                   uint64_t workspace_size, aclOpExecutor * executor,
-                   aclrtStream stream) :
-            aclnn_func_(aclnn_func),
-            workspace_addr_(workspace_addr),
-            workspace_size_(workspace_size),
-            executor_(executor),
-            stream_(stream) {}
-        virtual void run_task() override {
-            ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
-        }
-    private:
-        aclnn_func_t aclnn_func_;
-        void *          workspace_addr_;
-        uint64_t        workspace_size_;
-        aclOpExecutor * executor_;
-        aclrtStream     stream_;
-};
-
-/**
- * @brief Task class that releases ACL resources after usage.
- */
-class release_resource_task : public cann_task {
-public:
-    release_resource_task(std::vector<any_acl_resource>&& resources){
-        resource_ = std::move(resources);
-    }
-
-    virtual void run_task() override {
-        resource_.clear();
-    }
-private:
-    std::vector<any_acl_resource> resource_;
-};
-
-/**
- * @brief Task class for performing asynchronous memory copy operations.
- */
-class async_memcpy_task : public cann_task {
-public:
-    async_memcpy_task(void* dst, const void* src, size_t size,
-                      aclrtMemcpyKind kind, aclrtStream stream)
-        : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
-
-    virtual void run_task() override {
-        ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
-    }
-private:
-    void* dst_;
-    const void* src_;
-    size_t size_;
-    aclrtMemcpyKind kind_;
-    aclrtStream stream_;
-};
-
-/**
- * @brief Task class for performing asynchronous memory set operations.
- */
-class async_memset_task : public cann_task {
-    public:
-    async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
-            : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
-
-        virtual void run_task() override {
-            ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
-        }
-    private:
-        void* buffer_;
-        size_t size_;
-        int32_t value_;
-        aclrtStream stream_;
-};
-
 /**
 * @brief Launches an asynchronous task using the memory allocator.
 *
@@ -918,92 +893,20 @@ class async_memset_task : public cann_task {
 * same stream are executed in queue order.
 */

-#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                          \
-    do {                                                                                    \
-        uint64_t        workspaceSize = 0;                                                  \
-        aclOpExecutor * executor;                                                           \
-        void *          workspaceAddr = nullptr;                                            \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\
-        /* workspace should alloced in main thread to keep malloc order when using vmm. */  \
-        if (workspaceSize > 0) {                                                            \
-            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);            \
-            workspaceAddr = workspace_allocator.get();                                      \
-        }                                                                                   \
-        if (CTX.async_mode) {                                                               \
-            auto task =                                                                     \
-                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize,  \
-                    executor, CTX.stream()); \
-            CTX.task_queue.submit_task(std::move(task));                                    \
-        } else {                                                                            \
-            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\
-        }                                                                                   \
+#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                           \
+    do {                                                                                     \
+        uint64_t        workspaceSize = 0;                                                   \
+        aclOpExecutor * executor;                                                            \
+        void *          workspaceAddr = nullptr;                                             \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+        /* Allocate the workspace in the main thread to keep malloc order when using vmm. */ \
+        if (workspaceSize > 0) {                                                             \
+            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);             \
+            workspaceAddr = workspace_allocator.get();                                       \
+        }                                                                                    \
+        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));     \
    } while (0)

-/**
- * @brief Registers and releases multiple ACL resources, optionally deferring the release
- *        using a task.
- *
- * @tparam Args Types of the ACL resources.
- * @param ctx Backend context which manages task submission and async mode.
- * @param args Pointers to ACL resources to be released.
- */
-template <typename... Args>
-void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
-    std::vector<any_acl_resource> resources;
-    register_acl_resources(resources, std::forward<Args>(args)...);
-    if(ctx.async_mode) {
-        auto task = std::make_unique<release_resource_task>(std::move(resources));
-        ctx.task_queue.submit_task(std::move(task));
-    }
-}
-
-/**
- * @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
- *
- * @param ctx Backend context containing stream and async configuration.
- * @param dst Destination memory address.
- * @param src Source memory address.
- * @param len Size of memory to copy (in bytes).
- * @param kind Type of memory copy (host-to-device, device-to-host, etc).
- */
-inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
-                                   const void * src, size_t len, aclrtMemcpyKind kind) {
-    if (ctx.async_mode) {
-        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
-        ctx.task_queue.submit_task(std::move(task));
-    } else {
-        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
-    }
-}
-
-inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
-                                   const void * src, size_t len, aclrtMemcpyKind kind) {
-    if (ctx->async_mode) {
-        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
-        ctx->task_queue.submit_task(std::move(task));
-    } else {
-        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
-    }
-}
-
-/**
- * @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
- *
- * @param ctx Backend context containing stream and async configuration.
- * @param buffer Memory buffer to be set.
- * @param size Size of the memory buffer (in bytes).
- * @param value Value to set in the buffer.
- */
-inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
-                                   size_t size, int value) {
-    if (ctx.async_mode) {
-        auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
-        ctx.task_queue.submit_task(std::move(task));
-    } else {
-        ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
-    }
-}
-
 /**
 * @brief   Performs sparse expert-based matrix multiplication using the CANN backend.
 *
@@ -1029,7 +932,7 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
 * @param dst The destination tensor where the expert-weighted token outputs are stored.
 *            Expected to be of shape [M, K, N, 1].
 */
-void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);

 /**
 * @brief   Check whether a tensor is a weight tensor for matrix multiplication.
@@ -1041,20 +944,14 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @param tensor Pointer to the target ggml_tensor object (const-qualified).
 */
-static bool is_matmul_weight(const ggml_tensor* tensor) {
-    std::string name = ggml_get_name(tensor);
-    static const std::unordered_set<std::string> weight_suffixes{
-        "output.weight",
-        "attn_q.weight",
-        "attn_k.weight",
-        "attn_v.weight",
-        "attn_output.weight",
-        "ffn_gate.weight",
-        "ffn_up.weight",
-        "ffn_down.weight"
-    };
+static bool is_matmul_weight(const ggml_tensor * tensor) {
+    std::string                                  name = ggml_get_name(tensor);
+    static const std::unordered_set<std::string> weight_suffixes{ "output.weight",      "attn_q.weight",
+                                                                  "attn_k.weight",      "attn_v.weight",
+                                                                  "attn_output.weight", "ffn_gate.weight",
+                                                                  "ffn_up.weight",      "ffn_down.weight" };

-    for (const auto& suffix : weight_suffixes) {
+    for (const auto & suffix : weight_suffixes) {
        if (name.find(suffix) != std::string::npos) {
            return true;
        }
@@ -1078,23 +975,17 @@ static bool is_matmul_weight(const ggml_tensor* tensor) {
 * @param ctx The CANN backend context used to manage execution and resources.
 * @param dst The destination tensor.
 */
-template <auto binary_op>
-void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+template <auto binary_op> void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];

-    aclTensor* acl_src0;
-    aclTensor* acl_src1;
-    aclTensor* acl_dst;
+    acl_tensor_ptr acl_src0, acl_src1, acl_dst;

    // Need bcast
-    bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
-    binary_op(ctx, acl_src0, acl_src1, acl_dst);
-
-    ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
+    bcast_shape(src0, src1, dst, acl_src0, acl_src1, acl_dst);
+    binary_op(ctx, acl_src0.get(), acl_src1.get(), acl_dst.get());
 }

-
 /**
 * @brief Applies a unary operation to an input tensor using the CANN backend.
 *
@@ -1102,20 +993,19 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 * and stores the result in the destination tensor.
 *
 * @tparam unary_op A callable with the signature:
- *         void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
+ *         void(ggml_backend_cann_context&, aclTensor *, aclTensor *)
 *         where the first aclTensor is the source and the second is the destination.
 * @param ctx The CANN backend context for managing resources and execution.
 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
 */
-template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
-    void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+template <void unary_op(ggml_backend_cann_context &, aclTensor *, aclTensor *)>
+void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];

-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

-    unary_op(ctx, acl_src, acl_dst);
-    ggml_cann_release_resources(ctx, acl_src, acl_dst);
+    unary_op(ctx, acl_src.get(), acl_dst.get());
 }

 /**
@@ -1138,9 +1028,9 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
 *
 * @see GGML_CANN_CALL_OP_UNARY
 */
-void ggml_cann_op_unary(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                        ggml_backend_cann_context &                                                ctx,
+                        ggml_tensor *                                                              dst);

 /**
 * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
@@ -1172,9 +1062,9 @@ void ggml_cann_op_unary(
 *
 * @see GGML_CANN_CALL_OP_UNARY_GATED
 */
-void ggml_cann_op_unary_gated(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                              ggml_backend_cann_context &                                                ctx,
+                              ggml_tensor *                                                              dst);

 /**
 * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
@@ -1197,16 +1087,13 @@ void ggml_cann_op_unary_gated(
 * @see ggml_cann_op_unary
 * @see GGML_CANN_CALL_ACLNN_OP
 */
-#define GGML_CANN_CALL_OP_UNARY(OP_NAME)                              \
-    do {                                                              \
-        auto lambda = [](ggml_backend_cann_context& ctx,              \
-            aclTensor* acl_src,                                       \
-            aclTensor* acl_dst) {                                     \
-            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
-        };                                                            \
-        ggml_cann_op_unary(lambda, ctx, dst);                         \
-    }                                                                 \
-    while (0)
+#define GGML_CANN_CALL_OP_UNARY(OP_NAME)                                                              \
+    do {                                                                                              \
+        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
+        };                                                                                            \
+        ggml_cann_op_unary(lambda, ctx, dst);                                                         \
+    } while (0)

 /**
 * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
@@ -1229,15 +1116,32 @@ void ggml_cann_op_unary_gated(
 * @see ggml_cann_op_unary_gated
 * @see GGML_CANN_CALL_ACLNN_OP
 */
-#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                        \
-    do {                                                              \
-        auto lambda = [](ggml_backend_cann_context& ctx,              \
-            aclTensor* acl_src,                                       \
-            aclTensor* acl_dst) {                                     \
-            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
-        };                                                            \
-        ggml_cann_op_unary_gated(lambda, ctx, dst);                   \
-    }                                                                 \
-    while (0)
+#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                                                        \
+    do {                                                                                              \
+        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
+        };                                                                                            \
+        ggml_cann_op_unary_gated(lambda, ctx, dst);                                                   \
+    } while (0)

 #endif  // CANN_ACLNN_OPS
+
+/**
+ * @brief Performs the outer product operation on two ggml tensors using the CANN backend.
+ *
+ * @details This function computes the outer product of two input tensors (src0 and src1)
+ * and stores the result in the destination tensor. The outer product operation is defined as:
+ * dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
+ *
+ * The function supports multiple data types including F32 and F16. For floating-point
+ * types, it uses batch matrix multiplication for efficient computation.
+ *
+ * The implementation handles 4D tensor broadcasting and batch processing automatically.
+ *
+ * @param ctx The CANN backend context for operation execution and memory management.
+ * @param dst The destination ggml_tensor where the outer product result will be stored.
+ *            The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
+ *
+ * @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
+ */
+void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -23,28 +23,28 @@
 #ifndef CANN_COMMON_H
 #define CANN_COMMON_H

-#include <acl/acl.h>
-
-#include <cstdio>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-#include <unistd.h>
-#include <functional>
-#include <optional>
-#include <list>
-
+#include "../ggml-impl.h"
 #include "../include/ggml-cann.h"
 #include "../include/ggml.h"
-#include "../ggml-impl.h"

-#define MATRIX_ROW_PADDING 512
+#include <acl/acl.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <functional>
+#include <iostream>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <thread>
+#include <vector>
+
+#define MATRIX_ROW_PADDING    512
 #define GGML_CANN_MAX_STREAMS 8

 /**
@@ -56,8 +56,7 @@
 * @param line The line number at which the error occurred.
 * @param msg The error message.
 */
-[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
-                                  const char* file, int line, const char* msg);
+[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

 /**
 * @brief Checks the result of a CANN function call and invokes the error
@@ -89,25 +88,24 @@ struct ggml_cann_device_info {
     * @brief Information about a single CANN device.
     */
    struct cann_device_info {
-        int cc;                 /**< Compute capability.                   */
+        int    cc;              /**< Compute capability.                   */
        size_t smpb;            /**< Maximum shared memory per block.      */
-        bool vmm;               /**< Virtual memory support.               */
+        bool   vmm;             /**< Virtual memory support.               */
        size_t vmm_granularity; /**< Granularity of virtual memory.        */
        size_t total_vram;      /**< Total video RAM available on the device. */
    };

-    cann_device_info devices[GGML_CANN_MAX_DEVICES] =
-        {}; /**< Array of CANN device information. */
+    cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; /**< Array of CANN device information. */
 };

-const ggml_cann_device_info& ggml_cann_info();
+const ggml_cann_device_info & ggml_cann_info();

-void ggml_cann_set_device(int32_t device);
+void    ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();

-std::optional<std::string> get_env(const std::string& name);
-bool parse_bool(const std::string& value);
-int parse_integer(const std::string& value);
+std::optional<std::string> get_env(const std::string & name);
+bool                       parse_bool(const std::string & value);
+int                        parse_integer(const std::string & value);

 /**
 * @brief Abstract base class for memory pools used by CANN.
@@ -126,7 +124,7 @@ struct ggml_cann_pool {
     *                     will be stored.
     * @return             Pointer to the allocated memory block.
     */
-    virtual void* alloc(size_t size, size_t* actual_size) = 0;
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;

    /**
     * @brief Frees a previously allocated memory block.
@@ -136,16 +134,16 @@ struct ggml_cann_pool {
     * @note Note that all CANN opertors are running async. Make sure memory is
     *       still avaiable before this operator finished.
     */
-    virtual void free(void* ptr, size_t size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
 };

 /**
 * @brief RAII wrapper for managing memory allocations from a CANN memory pool.
 */
 struct ggml_cann_pool_alloc {
-    ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
-    void* ptr = nullptr;    /**< Pointer to the allocated memory block. */
-    size_t actual_size = 0; /**< Actual size of the allocated memory block. */
+    ggml_cann_pool * pool        = nullptr; /**< Pointer to the memory pool. */
+    void *           ptr         = nullptr; /**< Pointer to the allocated memory block. */
+    size_t           actual_size = 0;       /**< Actual size of the allocated memory block. */

    /**
     * @brief Default constructor.
@@ -156,16 +154,14 @@ struct ggml_cann_pool_alloc {
     * @brief Constructor that initializes the memory pool.
     * @param pool Reference to the memory pool.
     */
-    explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
+    explicit ggml_cann_pool_alloc(ggml_cann_pool & pool) : pool(&pool) {}

    /**
     * @brief Constructor that initializes the memory pool and allocates memory.
     * @param pool Reference to the memory pool.
     * @param size Size of the memory block to allocate.
     */
-    ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
-        alloc(size);
-    }
+    ggml_cann_pool_alloc(ggml_cann_pool & pool, size_t size) : pool(&pool) { alloc(size); }

    /**
     * @brief Destructor that frees the allocated memory block.
@@ -181,7 +177,7 @@ struct ggml_cann_pool_alloc {
     * @param size Size of the memory block to allocate.
     * @return Pointer to the allocated memory block.
     */
-    void* alloc(size_t size) {
+    void * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = pool->alloc(size, &this->actual_size);
@@ -194,7 +190,7 @@ struct ggml_cann_pool_alloc {
     * @param size Size of the memory block to allocate.
     * @return Pointer to the allocated memory block.
     */
-    void* alloc(ggml_cann_pool& pool, size_t size) {
+    void * alloc(ggml_cann_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }
@@ -203,151 +199,30 @@ struct ggml_cann_pool_alloc {
     * @brief Gets the pointer to the allocated memory block.
     * @return Pointer to the allocated memory block.
     */
-    void* get() { return ptr; }
+    void * get() { return ptr; }

    // Deleted copy constructor
-    ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
+    ggml_cann_pool_alloc(const ggml_cann_pool_alloc &) = delete;

    // Deleted move constructor
-    ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
+    ggml_cann_pool_alloc(ggml_cann_pool_alloc &&) = delete;

    // Deleted copy assignment operator
-    ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
+    ggml_cann_pool_alloc & operator=(const ggml_cann_pool_alloc &) = delete;

    // Deleted move assignment operator
-    ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
-};
-
-/**
- * @brief Function pointer type for ACLNN operator calls.
- */
-using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream);
-
-/**
- * @brief Base class for all CANN tasks to be submitted to the task queue.
- *
- * Users should override the run_task() method with actual task logic.
- */
-class cann_task {
-public:
-    virtual void run_task() {}
-};
-
-/**
- * @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances.
- */
-class cann_task_queue {
-public:
-    /**
-     * @brief Constructs a task queue with a fixed power-of-two capacity for a specific device.
-     *
-     * @param capacity Queue capacity. Must be a power of 2.
-     * @param device Target device ID (used for context setting).
-     */
-    explicit cann_task_queue(size_t capacity, int32_t device)
-        : buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
-          running_(false), device_(device) {
-        GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
-        mask_ = capacity_ - 1;
-    }
-
-    /**
-     * @brief Attempts to enqueue a task into the queue.
-     *
-     * @param item Unique pointer to the task.
-     * @return true if the task was successfully enqueued, false if the queue was full.
-     */
-    bool enqueue(std::unique_ptr<cann_task>&& item) {
-        size_t next_tail = (tail_ + 1) & mask_;
-
-        if (next_tail == head_) {
-            return false;
-        }
-
-        buffer_[tail_] = std::move(item);
-        std::atomic_thread_fence(std::memory_order_release);
-        tail_ = next_tail;
-
-        return true;
-    }
-
-    /**
-     * @brief Submits a task to the queue, and starts the worker thread if not already running.
-     *
-     * @param task Task to be submitted.
-     */
-    void submit_task(std::unique_ptr<cann_task>&& task) {
-        while(!enqueue(std::move(task))) {
-            std::this_thread::yield();
-            continue;
-        }
-
-        if (!running_) {
-            running_ = true;
-            thread_ = std::thread(&cann_task_queue::execute, this);
-        }
-
-    }
-
-    /**
-     * @brief Waits until the queue is completely empty and no tasks are being processed.
-     */
-    void wait() {
-        while (running_ && head_ != tail_) {
-            std::this_thread::yield();
-            continue;
-        }
-    }
-
-    /**
-     * @brief Stops the task queue and joins the worker thread.
-     */
-    void stop() {
-        running_ = false;
-        if (thread_.joinable()) {
-            thread_.join();
-        }
-    }
-
-private:
-    /**
-     * @brief Worker thread function that continuously dequeues and executes tasks.
-     */
-    void execute() {
-        ggml_cann_set_device(device_);
-
-        while (running_) {
-            if(head_ == tail_) {
-                std::this_thread::yield();
-                continue;
-            }
-
-            std::atomic_thread_fence(std::memory_order_acquire);
-            buffer_[head_]->run_task();
-            buffer_[head_].reset();
-            head_ = (head_ + 1) & mask_;
-        }
-    }
-
-    std::vector<std::unique_ptr<cann_task>> buffer_;
-    const size_t capacity_;
-    size_t mask_;
-    size_t head_;
-    size_t tail_;
-    bool running_;
-    std::thread thread_;
-    int32_t device_;
+    ggml_cann_pool_alloc & operator=(ggml_cann_pool_alloc &&) = delete;
 };

 #ifdef USE_ACL_GRAPH
 struct ggml_graph_node_properties {
    // dst tensor
-    void * node_address;
+    void *  node_address;
    int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
+    size_t  nb[GGML_MAX_DIMS];

    // src tensor
-    void * src_address[GGML_MAX_SRC];
+    void *  src_address[GGML_MAX_SRC];
    int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
    size_t  src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];

@@ -376,13 +251,11 @@ struct ggml_cann_graph {
 * move existing graphs to the front (most recently used), and clear the cache.
 */
 struct ggml_cann_graph_lru_cache {
-    size_t capacity;  /**< Maximum number of graphs in the cache. */
+    size_t capacity;                         /**< Maximum number of graphs in the cache. */

-    std::list<ggml_cann_graph*> cache_list; /**< List storing cached graphs as raw pointers. */
+    std::list<ggml_cann_graph *> cache_list; /**< List storing cached graphs as raw pointers. */

-    ggml_cann_graph_lru_cache() {
-        capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
-    }
+    ggml_cann_graph_lru_cache() { capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); }

    /**
     * @brief Push a new graph to the front of the cache.
@@ -390,11 +263,11 @@ struct ggml_cann_graph_lru_cache {
     * @param new_node Pointer to the new ggml_cann_graph to cache.
     *        Ownership is transferred to the cache (cache will delete it).
     */
-    void push(ggml_cann_graph* new_node) {
+    void push(ggml_cann_graph * new_node) {
        if (cache_list.size() >= capacity) {
-            ggml_cann_graph* old = cache_list.back();
+            ggml_cann_graph * old = cache_list.back();
            cache_list.pop_back();
-            delete old; // free the old graph
+            delete old;  // free the old graph
        }
        cache_list.push_front(new_node);
    }
@@ -403,7 +276,7 @@ struct ggml_cann_graph_lru_cache {
     * @brief Move an existing graph to the front of the cache.
     * @param node Pointer to the ggml_cann_graph to move.
     */
-    void move_to_front(ggml_cann_graph* node) {
+    void move_to_front(ggml_cann_graph * node) {
        cache_list.remove(node);
        cache_list.push_front(node);
    }
@@ -421,92 +294,145 @@ struct ggml_cann_graph_lru_cache {
    /**
     * @brief Destructor that clears the cache and frees all cached graphs.
     */
-    ~ggml_cann_graph_lru_cache() {
-        clear();
-    }
+    ~ggml_cann_graph_lru_cache() { clear(); }
 };
 #endif  // USE_ACL_GRAPH

 struct ggml_cann_rope_cache {
    ~ggml_cann_rope_cache() {
-        if(theta_scale_cache != nullptr) {
+        if (theta_scale_cache) {
            ACL_CHECK(aclrtFree(theta_scale_cache));
        }
-        if(sin_cache != nullptr) {
+        if (sin_cache) {
            ACL_CHECK(aclrtFree(sin_cache));
        }
-        if(cos_cache != nullptr) {
+        if (cos_cache) {
            ACL_CHECK(aclrtFree(cos_cache));
        }
+        if (position_select_index) {
+            ACL_CHECK(aclrtFree(position_select_index));
+        }
+        if (theta_scale_exp_host) {  // *_host buffers are released with free(), not aclrtFree()
+            free(theta_scale_exp_host);
+        }
+        if (position_select_index_host) {
+            free(position_select_index_host);
+        }
    }

-    void* theta_scale_cache = nullptr;
-    int64_t theta_scale_length = 0;
+    bool equal(int64_t theta_scale_length,
+               int64_t position_length,
+               float   ext_factor,
+               float   theta_scale,
+               float   freq_scale,
+               float   attn_factor,
+               bool    is_neox,
+               bool    indep_sects,
+               bool    mrope_used,
+               bool    is_imrope,
+               int     sections[4]) {
+        return this->theta_scale_length == theta_scale_length && this->position_length == position_length &&
+               this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale &&
+               this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects &&
+               this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] &&
+               this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3];
+    }
+
+    void set(int64_t theta_scale_length,
+             int64_t position_length,
+             float   ext_factor,
+             float   theta_scale,
+             float   freq_scale,
+             float   attn_factor,
+             bool    is_neox,
+             bool    indep_sects,
+             bool    mrope_used,
+             bool    is_imrope,
+             int     sections[4]) {
+        this->theta_scale_length = theta_scale_length;
+        this->position_length    = position_length;
+        this->ext_factor         = ext_factor;
+        this->theta_scale        = theta_scale;
+        this->freq_scale         = freq_scale;
+        this->attn_factor        = attn_factor;
+        this->is_neox            = is_neox;
+        this->indep_sects        = indep_sects;
+        this->mrope_used         = mrope_used;
+        this->is_imrope          = is_imrope;
+        this->sections[0]        = sections[0];  // all four section values are copied element-wise
+        this->sections[1]        = sections[1];
+        this->sections[2]        = sections[2];
+        this->sections[3]        = sections[3];
+    }
+
+    // Memory caches, prepared before inference.
+    void *  theta_scale_cache          = nullptr;
+    float * theta_scale_exp_host       = nullptr;
+    int *   position_select_index_host = nullptr;
+    void *  position_select_index      = nullptr;
    // sin/cos cache, used only to accelerate first layer on each device
-    void* sin_cache = nullptr;
-    void* cos_cache = nullptr;
-    int64_t position_length = 0;
+    void *  sin_cache                  = nullptr;
+    void *  cos_cache                  = nullptr;
    // Properties to check before reusing the sincos cache
-    bool cached = false;
-    float ext_factor = 0.0f;
-    float theta_scale = 0.0f;
-    float freq_scale = 0.0f;
-    float attn_factor = 0.0f;
-    bool is_neox = false;
+    int64_t theta_scale_length         = 0;
+    int64_t position_length            = 0;
+    bool    cached                     = false;
+    float   ext_factor                 = 0.0f;
+    float   theta_scale                = 0.0f;
+    float   freq_scale                 = 0.0f;
+    float   attn_factor                = 0.0f;
+    bool    is_neox                    = false;
+    bool    indep_sects                = false;
+    bool    mrope_used                 = false;
+    int     sections[4]                = { 0, 0, 0, 0 };
+    bool    is_imrope                  = false;
 };

 struct ggml_cann_tensor_cache {
    ~ggml_cann_tensor_cache() {
-        if(cache != nullptr) {
+        if (cache != nullptr) {
            ACL_CHECK(aclrtFree(cache));
        }
    }

-    void* cache = nullptr;
-    int64_t size = 0;
+    void *  cache = nullptr;
+    int64_t size  = 0;
 };

 /**
 * @brief Context for managing CANN backend operations.
 */
 struct ggml_backend_cann_context {
-    int32_t device;                  /**< Device ID. */
-    std::string name;                /**< Name of the device. */
-    std::string description;         /**< Description of the device. */
-    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
+    int32_t     device;               /**< Device ID. */
+    std::string name;                 /**< Name of the device. */
+    std::string description;          /**< Description of the device. */
+    aclrtEvent  copy_event = nullptr; /**< Event for managing copy operations. */
 #ifdef USE_ACL_GRAPH
    /// Cached CANN ACL graph used for executing the current ggml computation graph.
    ggml_cann_graph_lru_cache graph_lru_cache;
-    bool acl_graph_mode = true;
+    bool                      acl_graph_mode = true;
 #endif
-    cann_task_queue task_queue;
-    bool async_mode;
+    bool                   async_mode = false;  // constructor no longer reads GGML_CANN_ASYNC_MODE, so default it here
    // Rope Cache
-    ggml_cann_rope_cache rope_cache;
+    ggml_cann_rope_cache   rope_cache;
    // Constant Pool
    ggml_cann_tensor_cache rms_norm_one_tensor_cache;
    ggml_cann_tensor_cache rms_norm_zero_tensor_cache;

-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = { nullptr }; /**< Array of streams for the device. */

    /**
     * @brief Constructor for initializing the context with a given device.
     * @param device Device ID.
     */
-    explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
+    explicit ggml_backend_cann_context(int device) : device(device), name("CANN" + std::to_string(device)) {
        ggml_cann_set_device(device);
        description = aclrtGetSocName();

-        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
-        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
-            device, async_mode ? "ON" : "OFF");
 #ifdef USE_ACL_GRAPH
        acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
-        GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
-              __func__, device,
-              acl_graph_mode ? "GRAPH" : "EAGER",
-              acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
+        GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", __func__, device, acl_graph_mode ? "GRAPH" : "EAGER",
+                      acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
 #endif
    }

@@ -515,7 +441,6 @@ struct ggml_backend_cann_context {
     */
    ~ggml_backend_cann_context() {
        ggml_cann_set_device(device);
-        task_queue.stop();
        if (copy_event != nullptr) {
            ACL_CHECK(aclrtDestroyEvent(copy_event));
        }
@@ -549,8 +474,7 @@ struct ggml_backend_cann_context {
    aclrtStream stream() { return stream(0); }

    // TODO: each stream should have a memory pool.
-    std::unique_ptr<ggml_cann_pool>
-        mem_pool; /**< Memory pool for the device. */
+    std::unique_ptr<ggml_cann_pool> mem_pool; /**< Memory pool for the device. */

    /**
     * @brief Create a new memory pool for a given device.
@@ -563,7 +487,7 @@ struct ggml_backend_cann_context {
     * @brief Get or create the memory pool for the context.
     * @return Reference to the memory pool.
     */
-    ggml_cann_pool& pool() {
+    ggml_cann_pool & pool() {
        if (mem_pool == nullptr) {
            mem_pool = new_pool_for_device(device);
        }
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -126,36 +126,48 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                )
                if (NOT ARM_MCPU_RESULT)
                    string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
+                    string(REGEX MATCH "-march=[^ ']+" ARM_MARCH_FLAG "${ARM_MCPU}")
+
+                    # on some old GCC we need to read -march=
+                    if (ARM_MARCH_FLAG AND NOT "${ARM_MARCH_FLAG}" STREQUAL "-march=native")
+                        set(ARM_NATIVE_FLAG "${ARM_MARCH_FLAG}")
+                    elseif(ARM_MCPU_FLAG AND NOT "${ARM_MCPU_FLAG}" STREQUAL "-mcpu=native")
+                        set(ARM_NATIVE_FLAG "${ARM_MCPU_FLAG}")
+                    endif()
                endif()
-                if ("${ARM_MCPU_FLAG}" STREQUAL "")
-                    set(ARM_MCPU_FLAG -mcpu=native)
-                    message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
+
+                if ("${ARM_NATIVE_FLAG}" STREQUAL "")
+                    set(ARM_NATIVE_FLAG -mcpu=native)
+                    message(WARNING "ARM -march/-mcpu not found, -mcpu=native will be used")
+                else()
+                    message(STATUS "ARM detected flags: ${ARM_NATIVE_FLAG}")
                endif()

                include(CheckCXXSourceRuns)

-                function(check_arm_feature tag code)
+                macro(check_arm_feature tag feature code)
                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-                    set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
+                    set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
                    check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
                    if (GGML_MACHINE_SUPPORTS_${tag})
-                        set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
+                        set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}")
                    else()
-                        set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
+                        set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
                        check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
                        if (GGML_MACHINE_SUPPORTS_no${tag})
-                            set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
+                            set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}")
+                            list(APPEND ARCH_FLAGS -U__ARM_FEATURE_${feature})
                        endif()
                    endif()
                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-                endfunction()
+                endmacro()

-                check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
-                check_arm_feature(i8mm    "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
-                check_arm_feature(sve     "#include <arm_sve.h>\nint main()  { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
-                check_arm_feature(sme     "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
+                check_arm_feature(dotprod DOTPROD     "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(i8mm    MATMUL_INT8 "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(sve     SVE         "#include <arm_sve.h>\nint main()  { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
+                check_arm_feature(sme     SME         "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")

-                list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
+                list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
            else()
                if (GGML_CPU_ARM_ARCH)
                    list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
@@ -205,35 +217,28 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                endif()
            endif()

-            # show enabled features
-            if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
-                set(FEAT_INPUT_FILE "NUL")
-            else()
-                set(FEAT_INPUT_FILE "/dev/null")
-            endif()
+            message(STATUS "Checking for ARM features using flags:")
+            foreach(flag IN LISTS ARCH_FLAGS)
+                message(STATUS "  ${flag}")
+            endforeach()

-            execute_process(
-                COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
-                INPUT_FILE ${FEAT_INPUT_FILE}
-                OUTPUT_VARIABLE ARM_FEATURE
-                RESULT_VARIABLE ARM_FEATURE_RESULT
-            )
-            if (ARM_FEATURE_RESULT)
-                message(WARNING "Failed to get ARM features")
-            else()
-                foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
-                    string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
-                    if (NOT ${feature_pos} EQUAL -1)
-                        # Special handling for MATMUL_INT8 when machine doesn't support i8mm
-                        if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
-                            message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
-                            list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
-                        else()
-                            message(STATUS "ARM feature ${feature} enabled")
-                        endif()
-                    endif()
-                endforeach()
-            endif()
+            include(CheckCXXSourceCompiles)
+            set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+            string(REPLACE ";" " " ARCH_FLAGS_STR "${ARCH_FLAGS}")
+            set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS_STR}")
+            foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
+                set(ARM_FEATURE "HAVE_${feature}")
+                check_cxx_source_compiles(
+                    "
+                    #if !defined(__ARM_FEATURE_${feature})
+                    #  error \"Feature ${feature} is not defined\"
+                    #endif
+                    int main() { return 0; }
+                    "
+                    ${ARM_FEATURE}
+                )
+            endforeach()
+            set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
        endif()
    elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
        message(STATUS "x86 detected")
@@ -388,9 +393,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
-                list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+                list(APPEND ARCH_FLAGS -mcpu=power10)
            elseif (EXTRACTED_NUMBER EQUAL 9)
-                list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
+                list(APPEND ARCH_FLAGS -mcpu=power9)
            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
            else()
@@ -448,51 +453,87 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                ggml-cpu/spacemit/ime_kernels.h
            )
        endif()
-        set(MARCH_STR "rv64gc")
-        if (GGML_RV_ZFH)
-            string(APPEND MARCH_STR "_zfh")
-        endif()
-        if (GGML_XTHEADVECTOR)
-            string(APPEND MARCH_STR "_xtheadvector")
-        elseif (GGML_RVV)
-            string(APPEND MARCH_STR "_v")
-            if (GGML_RV_ZVFH)
-                string(APPEND MARCH_STR "_zvfh")
+        if(NOT GGML_CPU_ALL_VARIANTS)
+            set(MARCH_STR "rv64gc")
+            if (GGML_RV_ZFH)
+                string(APPEND MARCH_STR "_zfh")
            endif()
+            if (GGML_XTHEADVECTOR)
+                string(APPEND MARCH_STR "_xtheadvector")
+            elseif (GGML_RVV)
+                string(APPEND MARCH_STR "_v")
+                if (GGML_RV_ZVFH)
+                    string(APPEND MARCH_STR "_zvfh")
+                endif()
+            endif()
+            if (GGML_RV_ZICBOP)
+                string(APPEND MARCH_STR "_zicbop")
+            endif()
+            list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
+        else()
+            # Begin with the lowest baseline
+            set(ARCH_DEFINITIONS "")
+
+            if (GGML_INTERNAL_RVV)
+                message(STATUS "RVV enabled")
+                list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
+                list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
+            endif()
+
+            ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
        endif()
-        if (GGML_RV_ZICBOP)
-            string(APPEND MARCH_STR "_zicbop")
-        endif()
-        list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
        message(STATUS "s390x detected")
-        list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
-        file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
-        string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/s390/quants.c)

-        # TODO: Separation to determine activation of VX/VXE/VXE2
-        if (${S390X_M} MATCHES "8561|8562")
-            message(STATUS "z15 target")
-            list(APPEND ARCH_FLAGS -march=z15)
-        elseif (${S390X_M} MATCHES "3931")
-            message(STATUS "z16 target")
-            list(APPEND ARCH_FLAGS -march=z16)
-        elseif (${S390X_M} MATCHES "9175|9176")
-            # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
-            #       binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
-            message(STATUS "z17 target")
-            list(APPEND ARCH_FLAGS -march=arch15)
-        else()
-            message(STATUS "Unknown target")
-            message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
-            list(APPEND ARCH_FLAGS -march=native -mtune=native)
+        # for native compilation
+        if (GGML_NATIVE)
+            # check machine level to determine target (input quoted so empty or ';'-containing cpuinfo stays one argument)
+            file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
+            string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M "${CPUINFO_CONTENTS}")
+
+            # TODO: Separation to determine activation of VX/VXE/VXE2
+            if (${S390X_M} MATCHES "8561|8562")
+                message(STATUS "z15 target")
+                list(APPEND ARCH_FLAGS -march=z15)
+            elseif (${S390X_M} MATCHES "3931")
+                message(STATUS "z16 target")
+                list(APPEND ARCH_FLAGS -march=z16)
+            elseif (${S390X_M} MATCHES "9175|9176")
+                # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+                #       binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
+                message(STATUS "z17 target")
+                list(APPEND ARCH_FLAGS -march=arch15)
+            else()
+                message(STATUS "Unknown target")
+                message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+                list(APPEND ARCH_FLAGS -march=native -mtune=native)
+            endif()
+        # for cross-compilation
+        elseif(GGML_CPU_ALL_VARIANTS)
+            # range through IBM z15 to z17
+            # NOTE: update when a new hardware level is released
+            foreach (ZHW RANGE 15 17)
+                if(DEFINED GGML_INTERNAL_Z${ZHW})
+                    message(STATUS "z${ZHW} cross-compile target")
+                    list(APPEND ARCH_FLAGS -march=z${ZHW})
+                endif()
+            endforeach()
        endif()

-        if (GGML_VXE)
-            message(STATUS "VX/VXE/VXE2 enabled")
+        if (GGML_VXE OR GGML_INTERNAL_VXE2)
+            message(STATUS "VXE2 enabled")
            list(APPEND ARCH_FLAGS -mvx -mzvector)
-            list(APPEND ARCH_DEFINITIONS GGML_VXE)
+            list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
        endif()
+
+        if (GGML_INTERNAL_NNPA)
+            message(STATUS "NNPA enabled")
+            list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
+        endif()
+
+        ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
        message(STATUS "Wasm detected")
        list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
@@ -556,6 +597,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ${KLEIDIAI_SRC}/kai/ukernels/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)

@@ -574,23 +616,34 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c)

        if (NOT DOTPROD_ENABLED MATCHES -1)
            list(APPEND GGML_KLEIDIAI_SOURCES
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c)
        endif()

        if (NOT I8MM_ENABLED MATCHES -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c)
        endif()

        if (NOT SME_ENABLED MATCHES -1)
            list(APPEND GGML_KLEIDIAI_SOURCES
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_asm.S
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -33,10 +33,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -44,27 +46,30 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
-#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
-#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__POWERPC__) || defined(__powerpc__)
 // ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
@@ -76,10 +81,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -87,6 +94,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -101,10 +109,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -112,6 +122,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -134,15 +145,18 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -163,10 +177,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -174,6 +190,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -196,10 +213,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -207,6 +226,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
--- a/ggml/src/ggml-cpu/arch/arm/quants.c
+++ b/ggml/src/ggml-cpu/arch/arm/quants.c
@@ -2044,6 +2044,26 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 }

+#ifdef __ARM_FEATURE_SVE // the helper below uses SVE intrinsics, so it is only compiled for SVE builds
+static inline svuint32_t ggml_decode_q4scales_and_mins_for_mmla(const uint32_t * vx_scales) { // reads w0..w2 = vx_scales[0..2] and unpacks them into four 32-bit lanes (layout at the return)
+    const svbool_t pg_all   = svptrue_pat_b32(SV_VL4); // first four 32-bit lanes
+    const svbool_t pg_false = svpfalse_b();            // 0x0000: no lanes active
+    const svbool_t pg_lo_8  = svwhilelt_b8_s32(0,  8); // 0x00ff: first 8 byte lanes, i.e. 32-bit lanes 0 and 1
+    const svbool_t pg_odd   = svzip1_b32(pg_false, pg_lo_8); // interleaving with all-false leaves 32-bit lanes 1 and 3 active
+
+    svuint32_t vutmp_hi, vutmp_lo;
+    svuint32_t vx01 = svld1_u32(pg_lo_8, vx_scales); // [w0, w1, 0, ...]
+    vutmp_hi = svzip1_u32(vx01, vx01); // [w0, w0, w1, w1]
+    vutmp_hi = svlsr_n_u32_m(pg_odd, vutmp_hi, 2); // [w0, w0 >> 2, w1, w1 >> 2]; even lanes are left untouched by the merging form
+    vutmp_hi = svreinterpret_u32_u64(svand_n_u64_x(pg_all, svreinterpret_u64_u32(vutmp_hi), UINT64_C(0x303030303f3f3f3f))); // one 64-bit mask per lane pair: even lanes &= 0x3f3f3f3f, odd lanes &= 0x30303030
+    const svuint32_t vx2 = svdup_u32(vx_scales[2]); // w2 broadcast to every lane
+    vutmp_lo = svlsr_u32_x(pg_all, vx2, svreinterpret_u32_s32(svindex_s32(-2, 2))); // per-lane shift amounts -2, 0, 2, 4: lane 1 gets w2 >> 0 and lane 3 gets w2 >> 4 (lanes 0 and 2 are discarded on the next line)
+    vutmp_lo = svand_n_u32_z(pg_odd, vutmp_lo, UINT32_C(0x0f0f0f0f)); // [0, w2 & 0x0f0f0f0f, 0, (w2 >> 4) & 0x0f0f0f0f]
+    svuint32_t vutmp = svorr_u32_z(pg_all, vutmp_hi, vutmp_lo); // combine both parts; lanes past the fourth are zeroed
+    return vutmp; // [w0 & 0x3f3f3f3f, (w2 & 0x0f0f0f0f) | ((w0 >> 2) & 0x30303030), w1 & 0x3f3f3f3f, ((w2 >> 4) & 0x0f0f0f0f) | ((w1 >> 2) & 0x30303030)]
+}
+#endif
+
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
 #ifdef __ARM_FEATURE_MATMUL_INT8
@@ -2066,8 +2086,220 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];
+#ifdef __ARM_FEATURE_SVE
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
+#endif

-#if defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        svbool_t pg32_2 = svptrue_pat_b32(SV_VL2);
+
+        const block_q4_K * GGML_RESTRICT vx0 = vx;
+        const block_q8_K * GGML_RESTRICT vy0 = vy;
+        const block_q4_K * GGML_RESTRICT vx1 = (const block_q4_K *) ((const uint8_t*)vx + bx);
+        const block_q8_K * GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by);
+
+        union {
+            uint32_t u32[8];
+            uint64_t u64[4];
+        } new_utmp;
+
+        svfloat32_t sumf1 = svdup_n_f32(0);
+
+        switch (vector_length) {
+            case 128:
+                {
+                    svbool_t pg_false = svpfalse_b();
+                    svbool_t pg_lo_8  = svwhilelt_b8_s32(0,  8);
+                    svbool_t vmins_mask1= svzip1_b32(pg_lo_8, pg_false);
+                    svbool_t vmins_mask2 = svzip1_b32(pg_false, pg_lo_8);
+                    svbool_t pg128_all  = svptrue_pat_b8(SV_VL16);
+                    for (int i = 0; i < nb; ++i) {
+                        svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
+                        svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
+                        svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d);
+                        svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].dmin)));
+                        svfloat32_t vy_dmins = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
+                        svfloat32_t svdmins = svmul_n_f32_x(pg128_all, svmul_f32_x(pg128_all, vy_dmins, vx_dmins), -1);
+                        const uint8_t * GGML_RESTRICT q4_0 = vx0[i].qs;
+                        const int8_t  * GGML_RESTRICT q8_0 = vy0[i].qs;
+                        const uint8_t * GGML_RESTRICT q4_1 = vx1[i].qs;
+                        const int8_t  * GGML_RESTRICT q8_1 = vy1[i].qs;
+                        svint16_t lo = svld1_s16(pg128_all, vy0[i].bsums + 0);
+                        svint16_t hi = svld1_s16(pg128_all, vy0[i].bsums + 8);
+                        svint16_t sum_tmp1 = svuzp1_s16(lo, hi);
+                        svint16_t sum_tmp2 = svuzp2_s16(lo, hi);
+                        svint16_t svq8sums_0 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2);
+                        lo = svld1_s16(pg128_all, vy1[i].bsums + 0);
+                        hi = svld1_s16(pg128_all, vy1[i].bsums + 8);
+                        sum_tmp1 = svuzp1(lo, hi);
+                        sum_tmp2 = svuzp2(lo, hi);
+                        svint16_t svq8sums_1 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2);
+                        svuint32_t decoded_scales0 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales);
+                        svuint32_t decoded_scales1 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales);
+                        svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1);
+                        svst2_u32(pg128_all, new_utmp.u32, decoded_scales);
+                        svint16_t svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp1_u32(svld1_u32(vmins_mask1, new_utmp.u32+4), svdup_n_u32(0)))));
+                        svint16_t svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp2_u32(svld1_u32(vmins_mask2, new_utmp.u32+4), svdup_n_u32(0)))));
+                        svint32_t svsumfs_tmp1 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_0));
+                        svint32_t svsumfs_tmp2 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_1));
+                        svint32_t svsumfs_tmp3 = svtrn1_s32(svsumfs_tmp1, svsumfs_tmp2);
+                        svint32_t svsumfs_tmp4 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_0));
+                        svint32_t svsumfs_tmp5 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_1));
+                        svint32_t svsumfs_tmp6 = svtrn1_s32(svsumfs_tmp4, svsumfs_tmp5);
+                        svint32_t svsumfs_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6)));
+                        svint32_t svsumfs_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6)));
+                        svint32_t svsumfs_tmp = svadd_s32_x(pg128_all, svsumfs_tmp7, svsumfs_tmp8);
+                        svint32_t svscales, sumi1, sumi2;
+                        svint32_t acc_sumif1 = svdup_n_s32(0);
+                        svint32_t acc_sumif2 = svdup_n_s32(0);
+                        svint8_t q4bytes_0_l, q4bytes_0_h, q4bytes_1_l, q4bytes_1_h, l0, l1, l2, l3,
+                                 q8bytes_0_h, q8bytes_0_l, q8bytes_1_h, q8bytes_1_l, r0, r1, r2, r3;
+#pragma GCC unroll 1
+                        for (int j = 0; j < QK_K/64; ++j) {
+                            q4bytes_0_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 0xf));
+                            q4bytes_1_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 0xf));
+                            q4bytes_0_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 0xf));
+                            q4bytes_1_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 0xf));
+                            l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
+                            l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
+                            l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
+                            l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
+                            q8bytes_0_h = svld1_s8(pg128_all, q8_0);
+                            q8bytes_1_h = svld1_s8(pg128_all, q8_1);
+                            q8bytes_0_l = svld1_s8(pg128_all, q8_0+16);
+                            q8bytes_1_l = svld1_s8(pg128_all, q8_1+16);
+                            r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
+                            r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
+                            r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
+                            r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
+                            sumi1 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3);
+                            svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24));
+                            acc_sumif1 = svmla_s32_x(pg128_all, acc_sumif1, svscales, sumi1);
+
+                            q4bytes_0_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 4));
+                            q4bytes_1_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 4));
+                            q4bytes_0_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 4));
+                            q4bytes_1_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 4));
+                            l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
+                            l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
+                            l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
+                            l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
+                            q8bytes_0_h = svld1_s8(pg128_all, q8_0+32);
+                            q8bytes_1_h = svld1_s8(pg128_all, q8_1+32);
+                            q8bytes_0_l = svld1_s8(pg128_all, q8_0+48);
+                            q8bytes_1_l = svld1_s8(pg128_all, q8_1+48);
+                            r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
+                            r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
+                            r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
+                            r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
+                            sumi2 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3);
+                            svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24));
+                            acc_sumif2 = svmla_s32_x(pg128_all, acc_sumif2, svscales, sumi2);
+                            q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64;
+                        }
+                        sumf1 = svmla_f32_x(pg128_all,
+                                svmla_f32_x(pg128_all,
+                                    sumf1,
+                                    svcvt_f32_x(pg128_all,
+                                        svadd_s32_x(pg128_all, acc_sumif1, acc_sumif2)),
+                                    svsuper_block_scales),
+                                svdmins,
+                                svcvt_f32_s32_x(pg128_all, svsumfs_tmp));
+                    }  //end of for nb
+                } // end of case 128
+                break;
+            case 256:
+            case 512:
+                {
+                    const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
+                    const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16);
+                    const svbool_t pg256_all = svptrue_pat_b8(SV_ALL);
+                    for (int i = 0; i < nb; ++i) {
+                        const uint8_t * GGML_RESTRICT q4_0 = vx0[i].qs;
+                        const int8_t  * GGML_RESTRICT q8_0 = vy0[i].qs;
+                        const uint8_t * GGML_RESTRICT q4_1 = vx1[i].qs;
+                        const int8_t  * GGML_RESTRICT q8_1 = vy1[i].qs;
+                        svint32_t svscales, sumi1, sumi2;
+                        svint32_t acc_sumif1 = svdup_n_s32(0);
+                        svint32_t acc_sumif2 = svdup_n_s32(0);
+                        svint8_t l0, l1, l2, l3, r0, r1, r2, r3;
+                        svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
+                        svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
+                        svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp));
+                        svfloat32_t svsuper_block_scales = svmul_f32_z(pg32_4, vy_d, vx_d);
+                        svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].dmin)));
+                        svfloat64_t vy_dmins_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
+                        svfloat32_t vy_dmins = svreinterpret_f32_f64(svuzp1_f64(vy_dmins_tmp, vy_dmins_tmp));
+                        svfloat32_t svdmins = svmul_n_f32_x(pg32_4, svmul_f32_x(pg32_4, vx_dmins, vy_dmins), -1);
+                        svint16_t rc1 = svuzp1_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums));
+                        svint16_t rc2 = svuzp2_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums));
+                        svint16_t svq8sums = svadd_s16_x(pg256_all, rc1, rc2);
+                        svuint32_t decoded_scales0 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales);
+                        svuint32_t decoded_scales1 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales);
+                        svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1);
+                        svst2_u32(pg8_16, new_utmp.u32, decoded_scales);
+                        svint16_t new_svq8sums_0 = svreinterpret_s16_u64(svtrn1_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums)));
+                        svint16_t new_svq8sums_1 = svreinterpret_s16_u64(svtrn2_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums)));
+                        svuint64_t new_mins_0 = svdup_u64(new_utmp.u64[2]);
+                        svuint64_t new_mins_1 = svdup_u64(new_utmp.u64[3]);
+                        svint16_t new_svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_0)));
+                        svint16_t new_svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_1)));
+                        svint64_t dot_prod_0 = svdot_s64(svdup_s64(0), new_svmins8_0, new_svq8sums_0);
+                        svint64_t dot_prod_1 = svdot_s64(dot_prod_0, new_svmins8_1, new_svq8sums_1);
+                        svfloat32_t converted_dot_prod_1 = svcvt_f32_s64_x(pg256_all, dot_prod_1);
+                        svfloat32_t svsumfs_tmp = svuzp1_f32(converted_dot_prod_1, converted_dot_prod_1);
+
+#pragma GCC unroll 1
+                        for (int j = 0; j < QK_K/64; ++j) {
+                            svuint8_t q4bytes_0 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 0xf);
+                            svuint8_t q4bytes_1 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 0xf);
+                            svuint8_t q4bytes_2 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 4);
+                            svuint8_t q4bytes_3 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 4);
+                            l0 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1)));
+                            l1 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1)));
+                            l2 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3)));
+                            l3 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3)));
+                            svint8_t q8bytes_0 = svld1_s8(pg256_all, q8_0);
+                            svint8_t q8bytes_1 = svld1_s8(pg256_all, q8_1);
+                            svint8_t q8bytes_2 = svld1_s8(pg256_all, q8_0+32);
+                            svint8_t q8bytes_3 = svld1_s8(pg256_all, q8_1+32);
+                            r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
+                            r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
+                            r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3)));
+                            r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3)));
+                            sumi1 = svmmla(svmmla(svdup_n_s32(0), r0, l0), r1, l1);
+                            svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24));
+                            acc_sumif1 = svmla_s32_x(pg256_all, acc_sumif1, svscales, sumi1);
+                            sumi2 = svmmla(svmmla(svdup_n_s32(0), r2, l2), r3, l3);
+                            svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24));
+                            acc_sumif2 = svmla_s32_x(pg256_all, acc_sumif2, svscales, sumi2);
+                            q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64;
+                        }
+                        svint32_t acc_sumif = svadd_s32_x(pg256_all, acc_sumif1, acc_sumif2);
+                        svint32_t swap_acc_sumif = svext_s32(acc_sumif, acc_sumif, 4);
+                        acc_sumif = svadd_s32_x(pg32_4, acc_sumif, swap_acc_sumif);
+                        sumf1 = svmla_f32_x(pg32_4,
+                                svmla_f32_x(pg32_4,
+                                    sumf1,
+                                    svcvt_f32_x(pg32_4, acc_sumif),
+                                    svsuper_block_scales),
+                                svdmins,
+                                svsumfs_tmp);
+                    } // end of for nb
+                } // end of case 256-512
+                break;
+            default:
+                assert(false && "Unsupported vector length");
+                break;
+        }
+
+        svst1_f32(pg32_2, s, sumf1);
+        svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sumf1), svdup_n_u8(0), 8)));
+
+        return;
+    }
+#elif defined(__ARM_FEATURE_MATMUL_INT8)
    if (nrc == 2) {
        const block_q4_K * GGML_RESTRICT x0 = x;
        const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
@@ -2235,7 +2467,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

-        const int vector_length = ggml_cpu_get_sve_cnt()*8;
        const svuint8_t m4b = svdup_n_u8(0xf);
        const svint32_t mzero = svdup_n_s32(0);
        svint32_t sumi1 = svdup_n_s32(0);
@@ -2480,7 +2711,201 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    const int nb = n / QK_K;

-#if defined(__ARM_FEATURE_MATMUL_INT8)
+#ifdef __ARM_FEATURE_SVE
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
+#endif
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const svbool_t pg32_2 = svptrue_pat_b32(SV_VL2);
+
+        svfloat32_t sum = svdup_n_f32(0);
+
+        const block_q6_K * GGML_RESTRICT vx0 = vx;
+        const block_q8_K * GGML_RESTRICT vy0 = vy;
+        const block_q6_K * GGML_RESTRICT vx1 = (const block_q6_K *) ((const uint8_t*)vx + bx);
+        const block_q8_K * GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by);
+
+        switch (vector_length) {
+            case 128:
+                {
+                    const svbool_t pg128_all = svptrue_pat_b8(SV_ALL);
+                    for (int i = 0; i < nb; ++i) {
+                        const uint8_t * GGML_RESTRICT ql0 = vx0[i].ql;
+                        const uint8_t * GGML_RESTRICT qh0 = vx0[i].qh;
+                        const uint8_t * GGML_RESTRICT ql1 = vx1[i].ql;
+                        const uint8_t * GGML_RESTRICT qh1 = vx1[i].qh;
+                        const int8_t  * GGML_RESTRICT q80 = vy0[i].qs;
+                        const int8_t  * GGML_RESTRICT q81 = vy1[i].qs;
+
+                        const int8_t * GGML_RESTRICT scale0 = vx0[i].scales;
+                        const int8_t * GGML_RESTRICT scale1 = vx1[i].scales;
+
+                        svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
+                        svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
+                        svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d);
+                        // process q8sum summation 128 bit route
+                        const svint16_t q8sums_01 = svld1_s16(pg128_all, vy0[i].bsums);
+                        const svint16_t q8sums_02 = svld1_s16(pg128_all, vy0[i].bsums + 8);
+                        const svint16_t q8sums_11 = svld1_s16(pg128_all, vy1[i].bsums);
+                        const svint16_t q8sums_12 = svld1_s16(pg128_all, vy1[i].bsums + 8);
+                        const svint64x2_t q6scales_0_tmp = svld2_s64(pg128_all, (const int64_t *)scale0);
+                        const svint16_t q6scales_01 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 0)));
+                        const svint16_t q6scales_02 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 1)));
+                        const svint64x2_t q6scales_1_tmp = svld2_s64(pg128_all, (const int64_t *)scale1);
+                        const svint16_t q6scales_11 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 0)));
+                        const svint16_t q6scales_12 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 1)));
+                        const svint64_t prod = svdup_n_s64(0);
+
+                        svint32_t isum_tmp1 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_01), q8sums_02, q6scales_02));
+                        svint32_t isum_tmp2 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_11), q8sums_02, q6scales_12));
+                        svint32_t isum_tmp3 = svtrn1_s32(isum_tmp1, isum_tmp2);
+                        svint32_t isum_tmp4 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_01), q8sums_12, q6scales_02));
+                        svint32_t isum_tmp5 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_11), q8sums_12, q6scales_12));
+                        svint32_t isum_tmp6 = svtrn1_s32(isum_tmp4, isum_tmp5);
+                        svint32_t isum_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6)));
+                        svint32_t isum_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6)));
+                        svint32_t svisum_mins = svadd_s32_x(pg128_all, isum_tmp7, isum_tmp8);
+
+                        // process mmla
+                        svint8_t  l0, l1, r0, r1;
+                        svint32_t isum_tmp = svdup_n_s32(0);
+                        for (int j = 0; j < QK_K/128; ++j) {
+                            for (int k = 0; k < 8; ++k) {
+                                svuint8_t qhbits_0 = svld1_u8(pg128_all, qh0+16*(k%2));
+                                svuint8_t qhbits_1 = svld1_u8(pg128_all, qh1+16*(k%2));
+                                svuint8_t q6bits_0 = svld1_u8(pg128_all, ql0+16*(k%4));
+                                svuint8_t q6bits_1 = svld1_u8(pg128_all, ql1+16*(k%4));
+                                const int ql_pos = (k/4)*4;
+                                svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_0, 4);
+                                svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_1, 4);
+                                const int qh_pos = (k/2)*2;
+                                svuint8_t q6bytes_0_hi = svand_n_u8_x(pg128_all, qhbits_0, 0x3 << qh_pos);
+                                svuint8_t q6bytes_1_hi = svand_n_u8_x(pg128_all, qhbits_1, 0x3 << qh_pos);
+                                svint8_t  q6bytes_0, q6bytes_1;
+                                if (qh_pos <= 4) {
+                                    q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos)));
+                                    q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos)));
+                                } else {
+                                    q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_0_lo, svlsr_n_u8_x(pg128_all, q6bytes_0_hi, (qh_pos - 4))));
+                                    q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_1_lo, svlsr_n_u8_x(pg128_all, q6bytes_1_hi, (qh_pos - 4))));
+                                }
+                                svint8_t  q8bytes_0 = svld1_s8(pg128_all, q80+16*(k%8));
+                                svint8_t  q8bytes_1 = svld1_s8(pg128_all, q81+16*(k%8));
+                                l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
+                                l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
+                                r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
+                                r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
+                                svint32_t svscale = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k]));
+                                isum_tmp = svmla_s32_x(pg128_all, isum_tmp, svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), svscale);
+                            }
+                            qh0 += 32;  qh1 += 32;
+                            ql0 += 64;  ql1 += 64;
+                            q80 += 128; q81 += 128;
+                            scale0 += 8; scale1 += 8;
+                        }
+                        sum = svmla_f32_x(pg128_all, sum,
+                                svcvt_f32_x(pg128_all, svmla_s32_x(pg128_all, isum_tmp,
+                                        svisum_mins, svdup_n_s32(-32))),
+                                svsuper_block_scales);
+                    }
+                } // end of case 128
+                break;
+            case 256:
+            case 512:
+                {
+                    const svbool_t pg256_all = svptrue_pat_b8(SV_ALL);
+                    const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
+                    for (int i = 0; i < nb; ++i) {
+                        const uint8_t * GGML_RESTRICT ql0 = vx0[i].ql;
+                        const uint8_t * GGML_RESTRICT qh0 = vx0[i].qh;
+                        const uint8_t * GGML_RESTRICT ql1 = vx1[i].ql;
+                        const uint8_t * GGML_RESTRICT qh1 = vx1[i].qh;
+                        const int8_t  * GGML_RESTRICT q80 = vy0[i].qs;
+                        const int8_t  * GGML_RESTRICT q81 = vy1[i].qs;
+
+                        const int8_t * GGML_RESTRICT scale0 = vx0[i].scales;
+                        const int8_t * GGML_RESTRICT scale1 = vx1[i].scales;
+                        svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
+                        svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
+                        svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp));
+                        svfloat32_t svsuper_block_scales = svmul_f32_x(pg32_4, vy_d, vx_d);
+                        // process q8sum summation 256 bit route
+                        const svint16_t q8sums_0 = svld1_s16(pg256_all, vy0[i].bsums);
+                        const svint16_t q8sums_1 = svld1_s16(pg256_all, vy1[i].bsums);
+                        const svint16_t q6scales_0 = svunpklo_s16(svld1_s8(pg256_all, scale0));
+                        const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(pg256_all, scale1));
+                        const svint64_t prod = svdup_n_s64(0);
+                        svint32_t isum_tmp1  = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_0));
+                        svint32_t isum_tmp2  = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_1));
+                        svint32_t isum_tmp3  = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_0));
+                        svint32_t isum_tmp4  = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_1));
+                        svint32_t isum_tmp5  = svtrn1_s32(isum_tmp1, isum_tmp2);
+                        svint32_t isum_tmp6  = svtrn1_s32(isum_tmp3, isum_tmp4);
+                        svint32_t isum_tmp7  = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6)));
+                        svint32_t isum_tmp8  = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6)));
+                        svint32_t isum_tmp9  = svadd_s32_x(pg256_all, isum_tmp7, isum_tmp8);
+                        svint32_t isum_tmp10 = svreinterpret_s32_u8(svext_u8(svreinterpret_u8_s32(isum_tmp9), svreinterpret_u8_s32(isum_tmp9), 16));
+                        svint32_t svisum_mins = svadd_s32_z(pg32_4, isum_tmp9, isum_tmp10);
+
+                        // process mmla
+                        svint8_t l0, l1, r0, r1;
+                        svint32_t isum_tmp = svdup_n_s32(0);
+                        for (int j = 0; j < QK_K/128; ++j) {
+                            for (int k = 0; k < 8; k+=2) { // process 2 block
+                                svuint8_t qhbits_0  = svld1_u8(pg256_all, qh0);
+                                svuint8_t qhbits_1  = svld1_u8(pg256_all, qh1);
+                                svuint8_t q6bits_0  = svld1_u8(pg256_all, ql0+32*((k%4)/2));
+                                svuint8_t q6bits_1  = svld1_u8(pg256_all, ql1+32*((k%4)/2));
+                                const int ql_pos = (k/4)*4;
+                                svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_0, 4);
+                                svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_1, 4);
+                                const int qh_pos = (k/2)*2;
+                                svuint8_t q6bytes_0_hi = svand_n_u8_x(pg256_all, qhbits_0, 0x3 << qh_pos);
+                                svuint8_t q6bytes_1_hi = svand_n_u8_x(pg256_all, qhbits_1, 0x3 << qh_pos);
+                                svint8_t  q6bytes_0, q6bytes_1;
+                                if (qh_pos <= 4) {
+                                    q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos)));
+                                    q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos)));
+                                } else {
+                                    q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_0_lo, svlsr_n_u8_x(pg256_all, q6bytes_0_hi, (qh_pos - 4))));
+                                    q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_1_lo, svlsr_n_u8_x(pg256_all, q6bytes_1_hi, (qh_pos - 4))));
+                                }
+                                svint8_t  q8bytes_0 = svld1_s8(pg256_all, q80+32*(k/2));
+                                svint8_t  q8bytes_1 = svld1_s8(pg256_all, q81+32*(k/2));
+                                l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
+                                l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
+                                r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
+                                r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
+                                svint32_t svscale0 = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k]));
+                                svint32_t svscale1 = svzip1_s32(svdup_n_s32(scale0[k+1]), svdup_n_s32(scale1[k+1]));
+                                isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r0, l0), svscale0);
+                                isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r1, l1), svscale1);
+                            }
+                            qh0 += 32;  qh1 += 32;
+                            ql0 += 64;  ql1 += 64;
+                            q80 += 128; q81 += 128;
+                            scale0 += 8; scale1 += 8;
+                        } // end of for
+                        svint32_t swap_isum_tmp = svext_s32(isum_tmp, isum_tmp, 4);
+                        isum_tmp = svadd_s32_x(pg32_4, isum_tmp, swap_isum_tmp);
+                        sum = svmla_f32_x(pg32_4, sum,
+                                svcvt_f32_x(pg32_4, svmla_s32_x(pg32_4, isum_tmp,
+                                        svisum_mins, svdup_n_s32(-32))),
+                                svsuper_block_scales);
+                    }
+                } // end of case 256
+                break;
+            default:
+                assert(false && "Unsupported vector length");
+                break;
+        } // end of switch
+
+        svst1_f32(pg32_2, s, sum);
+        svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sum), svdup_n_u8(0), 8)));
+
+        return;
+    }
+#elif defined(__ARM_FEATURE_MATMUL_INT8)
    if (nrc == 2) {
        const block_q6_K * GGML_RESTRICT x0 = x;
        const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
@@ -2594,27 +3019,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            // adjust bias, apply superblock scale
            {
                int32_t bias[4];
-#ifdef __ARM_FEATURE_SVE
-                const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
-                const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
-                const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
-                const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
-                const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
-                const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
-                const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
-                const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
-                const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
-                const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
-                const svint64_t zero = svdup_n_s64(0);
-                bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
-                                                                               svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
-                bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
-                                                                               svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
-                bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
-                                                                               svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
-                bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
-                                                                               svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
-#else
                // NEON doesn't support int16 dot product, fallback to separated mul and add
                const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
                const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
@@ -2646,7 +3050,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
                bias[3] = vaddvq_s32(prod);

-#endif
                const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);

                const float32x4_t superblock_scale = {
@@ -2672,7 +3075,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif

 #ifdef __ARM_FEATURE_SVE
-    const int vector_length = ggml_cpu_get_sve_cnt()*8;
    float sum = 0;
    svuint8_t m4b = svdup_n_u8(0xf);
    svint32_t vzero = svdup_n_s32(0);
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -24,6 +24,29 @@

 #define UNUSED GGML_UNUSED

+static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,  // in: 12 packed bytes holding 8 six-bit scales and 8 six-bit mins
+                                             int16x8_t *     out_mins,   // out: the 8 mins, widened to 16-bit lanes
+                                             int8_t *        out_scales) {  // out: the 8 scales, one byte each (8 bytes are written)
+    constexpr uint32_t kmask1 = 0x3f3f3f3f;  // keeps the low 6 bits of every byte
+    constexpr uint32_t kmask2 = 0x0f0f0f0f;  // keeps the low 4 bits of every byte
+    constexpr uint32_t kmask3 = 0x03030303;  // keeps the low 2 bits of every byte
+    constexpr uint8_t  scales_size = 12;     // number of bytes read from scales_in
+
+    uint32_t sm[3];
+    memcpy(sm, scales_in, scales_size);  // byte copy into three local words; the byte-to-lane mapping below presumably assumes little-endian - confirm
+
+    const uint32_t   mins_0_3 = sm[1] & kmask1;  // mins 0..3: low 6 bits of each byte of word 1
+    const uint32_t   mins_4_7 = ((sm[2] >> 4) & kmask2) | (((sm[1] >> 6) & kmask3) << 4);  // mins 4..7: high nibbles of word 2 (bits 0-3) + top 2 bits of word 1 bytes (bits 4-5)
+    const uint32x2_t mins_u32 = { mins_0_3, mins_4_7 };
+
+    *out_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins_u32)));  // zero-extend 8 bytes to 16 bits; values are at most 6 bits so the signed view is lossless
+
+    uint32_t scales_u32[2];
+    scales_u32[0] = sm[0] & kmask1;  // scales 0..3: low 6 bits of each byte of word 0
+    scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);  // scales 4..7: low nibbles of word 2 (bits 0-3) + top 2 bits of word 0 bytes (bits 4-5)
+    memcpy(out_scales, scales_u32, 8);  // emit the 8 scale bytes
+}
+
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
@@ -474,6 +497,295 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
    ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }

+void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {  // block_q4_Kx8 weights (vx) times one block_q8_K row (vy) -> nc floats in s; NEON dotprod path, else generic fallback
+    constexpr int qk = QK_K;
+    const int     nb = n / qk;  // number of QK_K-sized blocks along n
+
+    constexpr int ncols_interleaved = 8;  // output columns packed together in one block_q4_Kx8
+    constexpr int blocklen          = 8;  // NOTE(review): only referenced by UNUSED() below; value is 8 although this is the "8x4" variant - confirm
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);  // NOTE(review): the NEON path never reads nr and uses a single q8_K row - confirm this constraint is intended for a GEMV
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    constexpr int    col_groups = ncols_interleaved / 4; // 0123 and 4567
+    const uint8x16_t m4b        = vdupq_n_u8(0x0f);  // mask selecting the low nibble of every byte
+
+    // 1x8 tile = 2 x 4
+    float32x4_t acc_f32[col_groups];
+
+    const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
+
+    for (int x = 0; x < nc / ncols_interleaved; x++) {  // one iteration per group of 8 output columns
+        const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+        for (int i = 0; i < col_groups; i++) {
+            acc_f32[i] = vdupq_n_f32(0);
+        }
+
+        for (int b = 0; b < nb; b++) {
+            float32x4_t q4_d_0        = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d));      // d0 d1 d2 d3
+            float32x4_t q4_d_1        = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4));  // d4 d5 d6 d7
+            float32x4_t q8_d          = vdupq_n_f32(q8_ptr[b].d);
+            float32x4_t sb_scale_0123 = vmulq_f32(q4_d_0, q8_d);  // combined super-block scale per column: d_q4 * d_q8
+            float32x4_t sb_scale_4567 = vmulq_f32(q4_d_1, q8_d);
+            float32x4_t q4_dmin_0     = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin));      // dmin 0..3
+            float32x4_t q4_dmin_1     = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4));  // dmin 4..7
+            float32x4_t sb_min_0123   = vmulq_f32(q4_dmin_0, q8_d);  // combined min scale per column: dmin_q4 * d_q8
+            float32x4_t sb_min_4567   = vmulq_f32(q4_dmin_1, q8_d);
+
+            // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
+            int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+            int32x4_t acc_lo[col_groups];  // integer dot products for the low-nibble quants
+            int32x4_t acc_hi[col_groups];  // integer dot products for the high-nibble quants
+
+            // Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
+            const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
+            int16_t         bsums_arr[8];
+            vst1q_s16(bsums_arr, bsums);  // spill to memory so individual sums can be broadcast per sub-block
+            for (int sb = 0; sb < QK_K / 64; sb++) {  // each iteration covers 64 quants: 32 from low nibbles + 32 from high nibbles
+                for (int i = 0; i < col_groups; i++) {
+                    acc_lo[i] = vdupq_n_s32(0);
+                    acc_hi[i] = vdupq_n_s32(0);
+                }
+                // Need scales for the low and high nibbles
+                // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                int16x8_t q4sb_mins[2];
+                int16x8_t q4sb_scales[2];
+                for (int i = 0; i < 2; i++) {
+                    int8_t    aux_q4sb[8];
+                    const int offset = sb * 24 + i * 12;  // i = 0 feeds acc_lo (low nibbles), i = 1 feeds acc_hi (high nibbles)
+                    decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
+                    q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));  // widen the 8 per-column scales to int16
+                }
+
+                int8x16_t q8_qs[64 / 16];
+                for (int i = 0; i < 64 / 16; i++) {
+                    q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16);  // the 64 q8 quants of this iteration as four 16-byte vectors
+                }
+
+                for (int c = 0; c < col_groups; c++) {
+                    uint8x16_t q4_cols[8];
+                    for (int i = 0; i < 8; i++) {
+                        q4_cols[i] = vld1q_u8(q4_ptr[b].qs + sb * QK_K + i * 32 + 16 * c);  // 16 bytes from half c of the i-th 32-byte row
+                    }
+
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[0], m4b)), q8_qs[0], 0);  // low nibbles vs q8 bytes 0..31, one 4-byte q8 lane per step
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[1], m4b)), q8_qs[0], 1);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[2], m4b)), q8_qs[0], 2);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[3], m4b)), q8_qs[0], 3);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[4], m4b)), q8_qs[1], 0);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[5], m4b)), q8_qs[1], 1);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[6], m4b)), q8_qs[1], 2);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[7], m4b)), q8_qs[1], 3);
+
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[0], 4)), q8_qs[2], 0);  // high nibbles vs q8 bytes 32..63
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[1], 4)), q8_qs[2], 1);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[2], 4)), q8_qs[2], 2);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[3], 4)), q8_qs[2], 3);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[4], 4)), q8_qs[3], 0);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[5], 4)), q8_qs[3], 1);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[6], 4)), q8_qs[3], 2);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[7], 4)), q8_qs[3], 3);
+                }
+
+                // Scales
+                // row c0123 blk0 and blk1
+                const int16x4_t   sc_0123_lo = vget_low_s16(q4sb_scales[0]);
+                const int16x4_t   sc_0123_hi = vget_low_s16(q4sb_scales[1]);
+                const float32x4_t sumf_0123  = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]),
+                                                                       vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0])));
+                acc_f32[0]                   = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123);  // acc += (d_q4 * d_q8) * scaled integer sum
+                // row c4567 blk0 and blk1
+                const int16x4_t   sc_4567_lo = vget_high_s16(q4sb_scales[0]);
+                const int16x4_t   sc_4567_hi = vget_high_s16(q4sb_scales[1]);
+                const float32x4_t sumf_4567  = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]),
+                                                                       vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1])));
+                acc_f32[1]                   = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567);
+
+                // Bias Correction
+                const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);  // q8 sum matching the low-nibble quants
+                const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);  // q8 sum matching the high-nibble quants
+
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));  // bias += bsum * min, columns 0..3
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));  // columns 4..7
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+            }  // for sb
+
+            acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123);  // acc -= bias * (dmin_q4 * d_q8)
+            acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567);
+        }  // for b
+
+        int base = x * ncols_interleaved;
+        vst1q_f32(s + base, acc_f32[0]);  // write the 8 results of this column group; bs is not used on this path
+        vst1q_f32(s + base + 4, acc_f32[1]);
+    }  // for x
+    return;
+#endif  // #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);  // reached only when the NEON dotprod path above is compiled out
+}
+
+void ggml_gemv_q4_K_8x8_q8_K(int                        n,
+                             float * GGML_RESTRICT      s,
+                             size_t                     bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int                        nr,
+                             int                        nc) {
+    constexpr int qk = QK_K;
+    const int     nb = n / qk;
+
+    constexpr int ncols_interleaved = 8;
+    constexpr int blocklen          = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    constexpr int    col_pairs = ncols_interleaved / 2;
+    const uint8x16_t m4b       = vdupq_n_u8(0x0f);
+
+    // 1x8 tile = 2 x 4
+    float32x4_t acc_f32[ncols_interleaved / 4];
+
+    const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
+
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+        for (int i = 0; i < ncols_interleaved / 4; i++) {
+            acc_f32[i] = vdupq_n_f32(0);
+        }
+
+        for (int b = 0; b < nb; b++) {
+            float32x4_t q4_d_0     = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d));      // d0 d1 d2 d3
+            float32x4_t q4_d_1     = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4));  // d4 d5 d6 d7
+            float32x4_t q8_d       = vdupq_n_f32(q8_ptr[b].d);
+            float32x4_t sb_scale_0 = vmulq_f32(q4_d_0, q8_d);
+            float32x4_t sb_scale_1 = vmulq_f32(q4_d_1, q8_d);
+            float32x4_t q4_dmin_0  = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin));      // dmin 0..3
+            float32x4_t q4_dmin_1  = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4));  // dmin 4..7
+            float32x4_t sb_min_0   = vmulq_f32(q4_dmin_0, q8_d);
+            float32x4_t sb_min_1   = vmulq_f32(q4_dmin_1, q8_d);
+
+            // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
+            int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+            // 2 sb each iteration
+            int32x4_t acc_lo[col_pairs];
+            int32x4_t acc_hi[col_pairs];
+
+            // Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
+            const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
+            int16_t         bsums_arr[8];
+            vst1q_s16(bsums_arr, bsums);
+            for (int sb = 0; sb < QK_K / 64; sb++) {
+                for (int i = 0; i < col_pairs; i++) {
+                    acc_lo[i] = vdupq_n_s32(0);
+                    acc_hi[i] = vdupq_n_s32(0);
+                }
+                // Need scales for the low and high nibbles
+                // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                int16x8_t q4sb_mins[2];  // int16 as its needed for bias_acc later
+                int16x8_t q4sb_scales[2];
+                for (int i = 0; i < 2; i++) {
+                    int8_t    aux_q4sb[8];
+                    const int offset = sb * 24 + i * 12;
+                    decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
+                    q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
+                }
+
+                const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
+
+                // Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
+                // but still need the qs to use the low and hi bits from q4
+                const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
+                int8x16_t      q8_qs[8];
+                for (int i = 0; i < 8; i++) {
+                    q8_qs[i] = (int8x16_t) vld1q_dup_s64((const int64_t *) (q8_base + i * 8));
+                }
+
+                // Q4s columns iterated in pairs (01, 23, 45, 67)
+                for (int cp = 0; cp < col_pairs; cp++) {
+                    uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_base + 16 * cp);
+                    uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_base + 16 * cp + 64);
+                    uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_base + 16 * cp + 128);
+                    uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_base + 16 * cp + 192);
+
+                    acc_lo[cp] =
+                        ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)), q8_qs[0]);  // 0 .. 7
+                    acc_lo[cp] =
+                        ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)), q8_qs[1]);  // 8 ..15
+                    acc_lo[cp] =
+                        ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)), q8_qs[2]);  // 16..23
+                    acc_lo[cp] =
+                        ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)), q8_qs[3]);  // 24..31
+
+                    acc_hi[cp] =
+                        ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)), q8_qs[4]);  // 32..39
+                    acc_hi[cp] =
+                        ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)), q8_qs[5]);  // 40..47
+                    acc_hi[cp] =
+                        ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)), q8_qs[6]);  // 48..55
+                    acc_hi[cp] =
+                        ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)), q8_qs[7]);  // 56..63
+                }
+
+                // Iterates over a pair of column pairs (4 columns) to use a single 128 register
+                // p = 0 -> 0123  p2 -> 4567
+                for (int i = 0, p = 0; p < col_pairs; i++, p += 2) {
+                    int16x4_t   group_scales_lo = p == 0 ? vget_low_s16(q4sb_scales[0]) : vget_high_s16(q4sb_scales[0]);
+                    int16x4_t   group_scales_hi = p == 0 ? vget_low_s16(q4sb_scales[1]) : vget_high_s16(q4sb_scales[1]);
+                    float32x4_t sb_scale        = p == 0 ? sb_scale_0 : sb_scale_1;
+
+                    // 0123 or 4567
+                    float32x4_t sumf_0 =
+                        vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
+                    acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
+
+                    float32x4_t sumf_1 =
+                        vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_hi), vpaddq_s32(acc_hi[p], acc_hi[p + 1])));
+                    acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_1);
+                }
+
+                // Multiply Acc bsum + mins
+                // Each pair of subblocks share the same bsums
+                // Load scalar bsum → broadcast to a vector (vdupq_n_s16(s)).
+                int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
+                int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
+
+                // cols 0-3 bias
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+
+                // cols 4-7 bias
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+            }  // for sb
+
+            acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0);
+            acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_1);
+        }  // for b
+
+        int base = x * ncols_interleaved;
+        vst1q_f32(s + base, acc_f32[0]);
+        vst1q_f32(s + base + 4, acc_f32[1]);
+    }  // for x
+    return;
+#endif  // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -1889,3 +2201,412 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
    ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
+
+void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    constexpr int qk = QK_K;
+    const int     nb = n / qk;
+    // For each group of 4 rows in vy (block_q8_Kx4) and 8 columns in vx (block_q4_Kx8): accumulate over nb blocks, store a 4x8 float tile in s
+    constexpr int ncols_interleaved = 8;
+    constexpr int blocklen          = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+    // NEON dot-product path; when the guard below is false the generic implementation at the end runs instead
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    constexpr int    q8_k_blocklen = 4;
+    constexpr int    acc_size  = 2 * 4;  // 4 rows x 2 column quartets
+    const uint8x16_t m4b       = vdupq_n_u8(0x0f);
+
+    // 8 accumulators: 4 rows x 2 column quartets, acc_f32[2 * row + j] holds columns 4j .. 4j+3 of that row
+    float32x4_t acc_f32[acc_size];
+
+    for (int y = 0; y < nr / q8_k_blocklen; y++) {
+        const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+            for (int i = 0; i < acc_size; i++) {
+                acc_f32[i] = vdupq_n_f32(0);
+            }
+
+            for (int b = 0; b < nb; b++) {
+                // d4 0 1 2 3, 4 5 6 7
+                float32x4_t q4_d_0123    = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d));
+                float32x4_t q4_d_4567    = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4));
+                // d8 0 1 2 3
+                float32x4_t q8_d_0123    = vld1q_f32(q8_ptr[b].d);
+                // mins
+                float32x4_t q4_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin));
+                float32x4_t q4_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4));
+
+                // Precomputation of scales and mins: index [r] is the q4 factor times the q8 scale of row r
+                float32x4_t sbd_scale_0123[q8_k_blocklen];
+                float32x4_t sbd_scale_4567[q8_k_blocklen];
+                float32x4_t sbd_min_0123[q8_k_blocklen];
+                float32x4_t sbd_min_4567[q8_k_blocklen];
+
+                sbd_scale_0123[0] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 0);
+                sbd_scale_4567[0] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 0);
+                sbd_min_0123[0]   = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 0);
+                sbd_min_4567[0]   = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 0);
+
+                sbd_scale_0123[1] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 1);
+                sbd_scale_4567[1] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 1);
+                sbd_min_0123[1]   = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 1);
+                sbd_min_4567[1]   = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 1);
+
+                sbd_scale_0123[2] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 2);
+                sbd_scale_4567[2] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 2);
+                sbd_min_0123[2]   = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 2);
+                sbd_min_4567[2]   = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 2);
+
+                sbd_scale_0123[3] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 3);
+                sbd_scale_4567[3] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 3);
+                sbd_min_0123[3]   = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 3);
+                sbd_min_4567[3]   = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 3);
+
+                // Pairwise-add adjacent bsums; vector k is read below as bsums_arr[sb][2 * row + {0, 1}], i.e. one (lo, hi) pair per row for sub-block k
+                const int16x8_t bsums[q8_k_blocklen] = {
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
+                };
+                int16_t bsums_arr[QK_K / 64][8];
+                for (int q8_row = 0; q8_row < 4; q8_row++) {  // NOTE(review): despite the name, this index is consumed as sb below
+                    vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
+                }
+
+                // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567, [2]->r1 0123, [3]->r1 4567 ..
+                int32x4_t bias_acc[acc_size];
+                for (int i = 0; i < acc_size; i++) {
+                    bias_acc[i] = vdupq_n_s32(0);
+                }
+
+                for (int sb = 0; sb < QK_K / 64; sb++) {
+                    // Int accumulators for qs vecdot: [row] -> cols 0123, [row + 4] -> cols 4567; lo/hi = low/high nibbles
+                    int32x4_t acc_lo[acc_size];
+                    int32x4_t acc_hi[acc_size];
+                    for (int i = 0; i < acc_size; i++) {
+                        acc_lo[i] = vdupq_n_s32(0);
+                        acc_hi[i] = vdupq_n_s32(0);
+                    }
+                    // Need scales for the low and high nibbles
+                    // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                    int16x8_t q4sb_scales[2];
+                    int16x8_t q4sb_mins[2];
+                    for (int i = 0; i < 2; i++) {
+                        int8_t    aux_q4sb[8];
+                        const int offset = sb * 24 + i * 12;
+                        decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
+                        q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
+                    }
+
+                    constexpr int reads_per_sb = 8;  // 8 * 16 bytes each => 32 qs * 4 rows
+                    for (int k = 0; k < reads_per_sb; k++) {
+                        const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k);
+                        const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128);
+
+                        // 0..3 & 32..35
+                        const uint8x16_t q4_0123 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k);
+                        const uint8x16_t q4_4567 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k + 16);
+
+                        const int8x16_t q4_0123_lo = vreinterpretq_s8_u8(vandq_u8(q4_0123, m4b));
+                        const int8x16_t q4_0123_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_0123, 4));
+
+                        acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q4_0123_lo, q8_blk0, 0);  //  0..3  r0 c0123
+                        acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q4_0123_lo, q8_blk0, 1);  //  0..3  r1 c0123
+                        acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q4_0123_lo, q8_blk0, 2);  //  0..3  r2 c0123
+                        acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q4_0123_lo, q8_blk0, 3);  //  0..3  r3 c0123
+
+                        acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q4_0123_hi, q8_blk1, 0);  // 32..35 r0 c0123
+                        acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q4_0123_hi, q8_blk1, 1);  // 32..35 r1 c0123
+                        acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q4_0123_hi, q8_blk1, 2);  // 32..35 r2 c0123
+                        acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q4_0123_hi, q8_blk1, 3);  // 32..35 r3 c0123
+
+                        const int8x16_t q4_4567_lo = vreinterpretq_s8_u8(vandq_u8(q4_4567, m4b));
+                        const int8x16_t q4_4567_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_4567, 4));
+
+                        acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q4_4567_lo, q8_blk0, 0);  //  0..3  r0 c4567
+                        acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q4_4567_lo, q8_blk0, 1);  //  0..3  r1 c4567
+                        acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q4_4567_lo, q8_blk0, 2);  //  0..3  r2 c4567
+                        acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q4_4567_lo, q8_blk0, 3);  //  0..3  r3 c4567
+
+                        acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q4_4567_hi, q8_blk1, 0);  // 32..35 r0 c4567
+                        acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q4_4567_hi, q8_blk1, 1);  // 32..35 r1 c4567
+                        acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q4_4567_hi, q8_blk1, 2);  // 32..35 r2 c4567
+                        acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q4_4567_hi, q8_blk1, 3);  // 32..35 r3 c4567
+                    }
+
+                    // Scale and bias application
+                    // acc is stored interleaved to match output layout
+                    const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
+                    const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
+                    const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
+                    const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
+                    for (int row = 0; row < q8_k_blocklen; row++) {
+                        // Scale application: integer dot products times the sub-block scales, then the per-row float scale
+                        // row c0123 blk0 and blk1
+                        const float32x4_t sumf_0123 =
+                            vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]),
+                                                    vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row])));
+                        acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123);
+
+                        // row c4567 blk0 and blk1
+                        const float32x4_t sumf_4567 =
+                            vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]),
+                                                    vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4])));
+                        acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567);
+
+                        // Bias: accumulate bsum * min per column; it is subtracted from acc_f32 after the sb loop
+                        const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]);
+                        const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]);
+
+                        // row c0123 blk0 and blk1
+                        bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
+                        bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+
+                        // row c4567 blk0 and blk1
+                        bias_acc[2 * row + 1] =
+                            vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
+                        bias_acc[2 * row + 1] =
+                            vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+                    }
+                }  // for sb
+                // Subtract the accumulated min term of this block, scaled by the per-row dmin * d8 factors
+                for (int row = 0; row < q8_k_blocklen; row++) {
+                    acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]);
+                    acc_f32[2 * row + 1] =
+                        vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]);
+                }
+            }  // for b
+            // Store the 4x8 tile; consecutive output rows are bs floats apart in s
+            for (int i = 0; i < q8_k_blocklen; i++) {
+                int row = y * q8_k_blocklen + i;
+                for (int j = 0; j < 2; j++) {
+                    int col    = x * ncols_interleaved + j * 4;
+                    int offset = row * bs + col;
+                    vst1q_f32(s + offset, acc_f32[2 * i + j]);
+                }
+            }
+        }  // for x
+    }  // for y
+    return;
+#endif  // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q4_K_8x8_q8_K(int                        n,
+                             float * GGML_RESTRICT      s,
+                             size_t                     bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int                        nr,
+                             int                        nc) {
+    constexpr int qk = QK_K;
+    const int     nb = n / qk;
+    // For each group of 4 rows in vy (block_q8_Kx4) and 8 columns in vx (block_q4_Kx8): accumulate over nb blocks, store a 4x8 float tile in s
+    constexpr int ncols_interleaved = 8;
+    constexpr int blocklen          = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+    // Int8 matrix-multiply (vmmlaq_s32) path; when the guard below is false the generic implementation at the end runs instead
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    constexpr int    q8_k_blocklen = 4;
+    const uint8x16_t m4b           = vdupq_n_u8(0x0f);
+
+    // 8 accumulators: 4 rows x 2 column quartets, acc_f32[2 * row + j] holds columns 4j .. 4j+3 of that row
+    float32x4_t acc_f32[blocklen];
+
+    for (int y = 0; y < nr / q8_k_blocklen; y++) {
+        const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+            for (int i = 0; i < blocklen; i++) {
+                acc_f32[i] = vdupq_n_f32(0);
+            }
+
+            for (int b = 0; b < nb; b++) {
+                // Each pair of adjacent bsums belongs to the same q8_K subblock, so they are added pairwise
+                const int16x8_t bsums[4]{
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
+                };
+                int16_t bsums_arr[4][8];
+                for (int q8_row = 0; q8_row < 4; q8_row++) {  // NOTE(review): despite the name, this index is consumed as sb below
+                    vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
+                }
+
+                int32x4_t sb_acc[4];    // Aux accumulators to store subblock (partial) results
+                int32x4_t acc[8];       // rows 01 stored in [0][1][2][3] rows 23 stored in [4][5][6][7]
+                int32x4_t bias_acc[8];  // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567, [2]->r1 0123 ...
+                for (int i = 0; i < 8; i++) {
+                    acc[i]      = vdupq_n_s32(0);
+                    bias_acc[i] = vdupq_n_s32(0);
+                }
+
+                for (int sb = 0; sb < QK_K / 64; sb++) {
+                    // Need scales for the low and high nibbles
+                    // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                    int8_t    q4sb_scales[2][8];
+                    int16x8_t q4sb_mins[2];  // int16 as it is needed for bias_acc later
+                    for (int i = 0; i < 2; i++) {
+                        const int offset = sb * 24 + i * 12;
+                        decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], q4sb_scales[i]);
+                    }
+
+                    // q8_ptr[b].qs has interleaved Q8 rows (01, 23)
+                    const int8_t * q8_base = q8_ptr[b].qs + sb * 256;
+
+                    int8x16_t q8_qs_01[8];
+                    int8x16_t q8_qs_23[8];
+
+                    // Load 32 bytes per step (16 for rows 01, 16 for rows 23), 1 subblock each time
+                    for (int i = 0; i < 8; i++) {
+                        const int offset = i * 32;  // 16 for row 01, 16 for row 23
+                        q8_qs_01[i]      = vld1q_s8(q8_base + offset);
+                        q8_qs_23[i]      = vld1q_s8(q8_base + offset + 16);
+                    }
+
+                    const int8x16_t q8s[2][8] = {
+                        { q8_qs_01[0], q8_qs_01[1], q8_qs_01[2], q8_qs_01[3],
+                          q8_qs_01[4], q8_qs_01[5], q8_qs_01[6], q8_qs_01[7] },
+                        { q8_qs_23[0], q8_qs_23[1], q8_qs_23[2], q8_qs_23[3],
+                          q8_qs_23[4], q8_qs_23[5], q8_qs_23[6], q8_qs_23[7] },
+                    };
+
+                    // Q4s columns iterated in pairs (01, 23, 45, 67)
+                    for (int cp = 0; cp < ncols_interleaved / 2; cp++) {
+                        for (int i = 0; i < 4; i++) {
+                            sb_acc[i] = vdupq_n_s32(0);
+                        }
+
+                        uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 0);    // 0 .. 7 & 32..39
+                        uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 64);   // 8 ..15 & 40..47
+                        uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 128);  // 16..23 & 48..55
+                        uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 192);  // 24..31 & 56..63
+                        const int8x16_t q4_nibbles[2][4] = {
+                            {
+                                vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)),
+                                vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)),
+                                vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)),
+                                vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)),
+                            },
+                            {
+                                vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)),
+                                vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)),
+                                vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)),
+                                vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)),
+                            }
+                        };
+
+                        // Calculates the Qs muladd of every row pair (rp) rows 01 and 23 of q8
+                        // for each of the internal 32 qs subblock (blk)
+                        for (int rp = 0; rp < 2; rp++) {
+                            for (int blk = 0; blk < 2; blk++) {
+                                const int8x16_t * q8  = &q8s[rp][4 * blk];
+                                const int8x16_t * q4  = q4_nibbles[blk];
+                                int32x4_t         acc = sb_acc[2 * rp + blk];
+                                // mul add for each qs in the same subblock
+                                for (int qs_offset = 0; qs_offset < 4; qs_offset++) {
+                                    acc = vmmlaq_s32(acc, q4[qs_offset], q8[qs_offset]);
+                                }
+                                sb_acc[2 * rp + blk] = acc;
+                            }
+                        }
+
+                        // Scales[i] corresponds to column i; duplicated since lanes {0,1} / {2,3} of a vmmla result are the 1st / 2nd column of the pair
+                        const int scale_offset = cp * 2;
+                        for (int blk = 0; blk < 2; blk++) {
+                            const int32x4_t block_scale = {
+                                (int32_t) q4sb_scales[blk][scale_offset],
+                                (int32_t) q4sb_scales[blk][scale_offset],
+                                (int32_t) q4sb_scales[blk][scale_offset + 1],
+                                (int32_t) q4sb_scales[blk][scale_offset + 1],
+                            };
+                            acc[cp]     = vmlaq_s32(acc[cp], sb_acc[blk], block_scale);
+                            acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[blk + 2], block_scale);
+                        }
+                    }
+
+                    // Multiply Acc bsum + mins
+                    for (int q8_row = 0; q8_row < 4; q8_row++) {
+                        // Each pair of subblocks share the same bsums
+                        // Load scalar bsum and broadcast it to a 4-lane vector (vdup_n_s16).
+                        int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][q8_row * 2]);
+                        int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][q8_row * 2 + 1]);
+
+                        bias_acc[2 * q8_row] =
+                            vmlal_s16(bias_acc[2 * q8_row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
+                        bias_acc[2 * q8_row] =
+                            vmlal_s16(bias_acc[2 * q8_row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+                        bias_acc[2 * q8_row + 1] =
+                            vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
+                        bias_acc[2 * q8_row + 1] =
+                            vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+                    }
+                }  // for sb
+
+                // Reorder the i8mm output so it matches the bias_acc / output layout ([2 * row + column quartet])
+                for (int i = 0; i < 8; i++) {
+                    int32x2x2_t aux = vzip_s32(vget_low_s32(acc[i]), vget_high_s32(acc[i]));
+                    acc[i]          = vcombine_s32(aux.val[0], aux.val[1]);
+                }
+                int32x4_t reorder_acc[8] = {
+                    vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1])),
+                    vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3])),
+                    vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1])),
+                    vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3])),
+                    vcombine_s32(vget_low_s32(acc[4]), vget_low_s32(acc[5])),
+                    vcombine_s32(vget_low_s32(acc[6]), vget_low_s32(acc[7])),
+                    vcombine_s32(vget_high_s32(acc[4]), vget_high_s32(acc[5])),
+                    vcombine_s32(vget_high_s32(acc[6]), vget_high_s32(acc[7])),
+                };
+                // Per row i and column quartet j: subtract the min term and add the scaled dot product
+                for (int i = 0; i < q8_k_blocklen; i++) {
+                    for (int j = 0; j < 2; j++) {
+                        float32x4_t       q8_d    = vdupq_n_f32(q8_ptr[b].d[i]);
+                        float32x4_t       q4_dmin = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].dmin + j * 4)));
+                        const float32x4_t dmins   = vmulq_f32(q4_dmin, q8_d);
+
+                        float32x4_t       q4_d  = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].d + j * 4)));
+                        const float32x4_t scale = vmulq_f32(q4_d, q8_d);
+
+                        acc_f32[2 * i + j] = vmlsq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(bias_acc[2 * i + j]), dmins);
+                        acc_f32[2 * i + j] =
+                            vmlaq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(reorder_acc[2 * i + j]), scale);
+                    }
+                }
+            }  // for b
+
+            // With the previous reorder, the tile is already in the correct memory layout; rows are bs floats apart in s.
+            for (int i = 0; i < q8_k_blocklen; i++) {
+                int row = y * q8_k_blocklen + i;
+                for (int j = 0; j < 2; j++) {
+                    int col    = x * ncols_interleaved + j * 4;
+                    int offset = row * bs + col;
+                    vst1q_f32(s + offset, acc_f32[2 * i + j]);
+                }
+            }
+        }  // for x
+    }  // for y
+    return;
+#endif  // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
--- a/ggml/src/ggml-cpu/arch/loongarch/quants.c
+++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -700,7 +700,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    for (; ib + 1 < nb; ib += 2) {

        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+        const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};

        const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);

@@ -714,11 +715,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        bx_1 = __lsx_vsub_b(bx_1, off);
        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

-        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
+        const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
+        const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};

        const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);

--- a/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp
+++ b/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp
@@ -0,0 +1,35 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__riscv) && __riscv_xlen == 64
+#include <sys/auxv.h>
+
+//https://github.com/torvalds/linux/blob/master/arch/riscv/include/uapi/asm/hwcap.h#L24
+#ifndef COMPAT_HWCAP_ISA_V
+#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
+#endif
+
+// RISC-V CPU features detected at construction time from getauxval(AT_HWCAP).
+struct riscv64_features {
+    bool has_rvv = false;  // true when the COMPAT_HWCAP_ISA_V bit is set in AT_HWCAP
+
+    riscv64_features() {
+        const uint32_t hwcap_bits = getauxval(AT_HWCAP);
+        has_rvv = (hwcap_bits & COMPAT_HWCAP_ISA_V) != 0;
+    }
+};
+
+// Score for this CPU backend build, derived from the features detected on the running machine.
+static int ggml_backend_cpu_riscv64_score() {
+    riscv64_features detected;
+#ifdef GGML_USE_RVV
+    // Built with GGML_USE_RVV: return 0 when has_rvv is false, otherwise the base score plus 1 << 1.
+    if (!detected.has_rvv) { return 0; }
+    return 1 + (1 << 1);
+#else
+    return 1;
+#endif
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_riscv64_score)
+
+#endif  // __riscv && __riscv_xlen == 64
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
            uint8_t *patmp = atmp;
            int vsums;
-            int tmp;
+            int tmp, t1, t2, t3, t4, t5, t6, t7;
            __asm__ __volatile__(
                "vsetivli zero, 16, e8, m1\n\t"
                "vmv.v.x v8, zero\n\t"
+                "lb zero, 15(%[sc])\n\t"
                "vle8.v v1, (%[sc])\n\t"
+                "vle8.v v2, (%[bsums])\n\t"
+                "addi %[tmp], %[bsums], 16\n\t"
                "vand.vi v0, v1, 0xF\n\t"
                "vsrl.vi v1, v1, 4\n\t"
+                "vle8.v v3, (%[tmp])\n\t"
                "vse8.v v0, (%[scale])\n\t"
                "vsetivli zero, 16, e16, m2\n\t"
-                "vle16.v v2, (%[bsums])\n\t"
                "vzext.vf2 v0, v1\n\t"
                "vwmul.vv v4, v0, v2\n\t"
                "vsetivli zero, 16, e32, m4\n\t"
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

            for (int j = 0; j < QK_K/128; ++j) {
                __asm__ __volatile__(
-                    "vsetvli zero, %[vl32], e8, m2\n\t"
+                    "lb zero, 31(%[q2])\n\t"
+                    "addi %[tmp], %[q2], 16\n\t"
+                    "addi %[t1], %[q8], 16\n\t"
+                    "vsetivli zero, 16, e8, m1\n\t"
                    "vle8.v v0, (%[q2])\n\t"
+                    "vle8.v v1, (%[tmp])\n\t"
                    "vsrl.vi v2, v0, 2\n\t"
+                    "vsrl.vi v3, v1, 2\n\t"
                    "vsrl.vi v4, v0, 4\n\t"
-                    "vsrl.vi v6, v0, 6\n\t"
-                    "vand.vi v0, v0, 0x3\n\t"
-                    "vand.vi v2, v2, 0x3\n\t"
-                    "vand.vi v4, v4, 0x3\n\t"
-                    "vsetvli zero, %[vl128], e8, m8\n\t"
+                    "addi %[tmp], %[q8], 32\n\t"
                    "vle8.v v8, (%[q8])\n\t"
-                    "vsetvli zero, %[vl64], e8, m4\n\t"
+                    "vle8.v v9, (%[t1])\n\t"
+                    "addi %[t1], %[t1], 32\n\t"
+                    "vsrl.vi v5, v1, 4\n\t"
+                    "vsrl.vi v6, v0, 6\n\t"
+                    "vsrl.vi v7, v1, 6\n\t"
+                    "vle8.v v10, (%[tmp])\n\t"
+                    "vle8.v v11, (%[t1])\n\t"
+                    "addi %[tmp], %[tmp], 32\n\t"
+                    "addi %[t1], %[t1], 32\n\t"
+                    "vand.vi v0, v0, 0x3\n\t"
+                    "vand.vi v1, v1, 0x3\n\t"
+                    "vand.vi v2, v2, 0x3\n\t"
+                    "vle8.v v12, (%[tmp])\n\t"
+                    "vle8.v v13, (%[t1])\n\t"
+                    "addi %[tmp], %[tmp], 32\n\t"
+                    "addi %[t1], %[t1], 32\n\t"
+                    "vand.vi v3, v3, 0x3\n\t"
+                    "vand.vi v4, v4, 0x3\n\t"
+                    "vand.vi v5, v5, 0x3\n\t"
+                    "vle8.v v14, (%[tmp])\n\t"
+                    "vle8.v v15, (%[t1])\n\t"
                    "vwmul.vv v16, v0, v8\n\t"
+                    "vwmul.vv v18, v1, v9\n\t"
+                    "vwmul.vv v20, v2, v10\n\t"
+                    "vwmul.vv v22, v3, v11\n\t"
                    "vwmul.vv v24, v4, v12\n\t"
-                    "vsetivli zero, 16, e16, m2\n\t"
+                    "vwmul.vv v26, v5, v13\n\t"
+                    "vwmul.vv v28, v6, v14\n\t"
+                    "vwmul.vv v30, v7, v15\n\t"
+                    "vsetivli zero, 8, e16, m1\n\t"
                    "vmv.v.x v0, zero\n\t"
-                    "vwredsum.vs v10, v16, v0\n\t"
+                    "lbu %[tmp], 0(%[scale])\n\t"
+                    "vwredsum.vs v8, v16, v0\n\t"
                    "vwredsum.vs v9, v18, v0\n\t"
-                    "vwredsum.vs v8, v20, v0\n\t"
-                    "vwredsum.vs v7, v22, v0\n\t"
-                    "vwredsum.vs v11, v24, v0\n\t"
-                    "vwredsum.vs v12, v26, v0\n\t"
-                    "vwredsum.vs v13, v28, v0\n\t"
-                    "vwredsum.vs v14, v30, v0\n\t"
+                    "lbu %[t1], 1(%[scale])\n\t"
+                    "vwredsum.vs v10, v20, v0\n\t"
+                    "vwredsum.vs v11, v22, v0\n\t"
+                    "lbu %[t2], 2(%[scale])\n\t"
+                    "vwredsum.vs v12, v24, v0\n\t"
+                    "vwredsum.vs v13, v26, v0\n\t"
+                    "lbu %[t3], 3(%[scale])\n\t"
+                    "vwredsum.vs v14, v28, v0\n\t"
+                    "vwredsum.vs v15, v30, v0\n\t"
+                    "lbu %[t4], 4(%[scale])\n\t"
+                    "vwredsum.vs v8, v17, v8\n\t"
+                    "vwredsum.vs v9, v19, v9\n\t"
+                    "lbu %[t5], 5(%[scale])\n\t"
+                    "vwredsum.vs v10, v21, v10\n\t"
+                    "vwredsum.vs v11, v23, v11\n\t"
+                    "lbu %[t6], 6(%[scale])\n\t"
+                    "vwredsum.vs v12, v25, v12\n\t"
+                    "vwredsum.vs v13, v27, v13\n\t"
+                    "lbu %[t7], 7(%[scale])\n\t"
+                    "vwredsum.vs v14, v29, v14\n\t"
+                    "vwredsum.vs v15, v31, v15\n\t"
                    "vsetivli zero, 4, e32, m1\n\t"
-                    "vslideup.vi v10, v9, 1\n\t"
-                    "vslideup.vi v8, v7, 1\n\t"
-                    "vslideup.vi v11, v12, 1\n\t"
-                    "vslideup.vi v13, v14, 1\n\t"
-                    "vslideup.vi v10, v8, 2\n\t"
-                    "vslideup.vi v11, v13, 2\n\t"
-                    "vsetivli zero, 8, e32, m2\n\t"
-                    "vle8.v v15, (%[scale])\n\t"
-                    "vzext.vf4 v12, v15\n\t"
-                    "vmul.vv v10, v10, v12\n\t"
-                    "vredsum.vs v0, v10, v0\n\t"
+                    "vmul.vx v0, v8, %[tmp]\n\t"
+                    "vmul.vx v1, v9, %[t1]\n\t"
+                    "vmacc.vx v0, %[t2], v10\n\t"
+                    "vmacc.vx v1, %[t3], v11\n\t"
+                    "vmacc.vx v0, %[t4], v12\n\t"
+                    "vmacc.vx v1, %[t5], v13\n\t"
+                    "vmacc.vx v0, %[t6], v14\n\t"
+                    "vmacc.vx v1, %[t7], v15\n\t"
                    "vmv.x.s %[tmp], v0\n\t"
-                    "add %[isum], %[isum], %[tmp]"
-                    : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
+                    "vmv.x.s %[t1], v1\n\t"
+                    "add %[isum], %[isum], %[tmp]\n\t"
+                    "add %[isum], %[isum], %[t1]"
+                    : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                    , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                    , [isum] "+&r" (isum)
                    : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
-                    , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                    : "memory"
                    , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                    , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            const  int8_t * restrict q8 = y[i].qs;

            int8_t * scale = (int8_t *)utmp;
-            int tmp;
+            int tmp, t1, t2, t3, t4, t5, t6, t7;
            __asm__ __volatile__(
                "vsetivli zero, 12, e8, m1\n\t"
                "vle8.v v0, (%[s6b])\n\t"
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            int isum = 0;
            for (int j = 0; j < QK_K; j += 128) {
                __asm__ __volatile__(
+                    "lb zero, 31(%[q3])\n\t"
                    "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
                    "vle8.v v8, (%[q3])\n\t"
                    "vsrl.vi v10, v8, 2\n\t"
                    "vsrl.vi v12, v8, 4\n\t"
                    "vsrl.vi v14, v8, 6\n\t"
+                    "lb zero, 64(%[q8])\n\t"
                    "vand.vi v8, v8, 3\n\t"
                    "vand.vi v10, v10, 3\n\t"
                    "vand.vi v12, v12, 3\n\t"
                    "vle8.v v2, (%[qh])\n\t"
+                    "lb zero, 127(%[q8])\n\t"
                    "vand.vx v4, v2, %[m]\n\t"
                    "slli %[m], %[m], 1\n\t"
                    "vmseq.vx v0, v4, zero\n\t"
                    "vadd.vi v8, v8, -4, v0.t\n\t"
+                    "lb zero, 0(%[q8])\n\t"
                    "vand.vx v4, v2, %[m]\n\t"
                    "slli %[m], %[m], 1\n\t"
                    "vmseq.vx v0, v4, zero\n\t"
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                    "vadd.vi v14, v14, -4, v0.t\n\t"
                    "vsetvli zero, %[vl128], e8, m8\n\t"
                    "vle8.v v0, (%[q8])\n\t"
+                    "lb %[tmp], 0(%[scale])\n\t"
+                    "lb %[t1], 1(%[scale])\n\t"
+                    "lb %[t2], 2(%[scale])\n\t"
+                    "lb %[t3], 3(%[scale])\n\t"
                    "vsetvli zero, %[vl64], e8, m4\n\t"
                    "vwmul.vv v16, v0, v8\n\t"
                    "vwmul.vv v24, v4, v12\n\t"
                    "vsetivli zero, 16, e16, m2\n\t"
                    "vmv.v.x v0, zero\n\t"
-                    "vwredsum.vs v10, v16, v0\n\t"
+                    "vwredsum.vs v8, v16, v0\n\t"
+                    "lb %[t4], 4(%[scale])\n\t"
+                    "lb %[t5], 5(%[scale])\n\t"
                    "vwredsum.vs v9, v18, v0\n\t"
-                    "vwredsum.vs v8, v20, v0\n\t"
-                    "vwredsum.vs v7, v22, v0\n\t"
-                    "vwredsum.vs v11, v24, v0\n\t"
-                    "vwredsum.vs v12, v26, v0\n\t"
-                    "vwredsum.vs v13, v28, v0\n\t"
-                    "vwredsum.vs v14, v30, v0\n\t"
+                    "vwredsum.vs v10, v20, v0\n\t"
+                    "vwredsum.vs v11, v22, v0\n\t"
+                    "vwredsum.vs v12, v24, v0\n\t"
+                    "lb %[t6], 6(%[scale])\n\t"
+                    "lb %[t7], 7(%[scale])\n\t"
+                    "vwredsum.vs v13, v26, v0\n\t"
+                    "vwredsum.vs v14, v28, v0\n\t"
+                    "vwredsum.vs v15, v30, v0\n\t"
                    "vsetivli zero, 4, e32, m1\n\t"
-                    "vslideup.vi v10, v9, 1\n\t"
-                    "vslideup.vi v8, v7, 1\n\t"
-                    "vslideup.vi v11, v12, 1\n\t"
-                    "vslideup.vi v13, v14, 1\n\t"
-                    "vslideup.vi v10, v8, 2\n\t"
-                    "vslideup.vi v11, v13, 2\n\t"
-                    "vsetivli zero, 8, e32, m2\n\t"
-                    "vle8.v v15, (%[scale])\n\t"
-                    "vsext.vf4 v12, v15\n\t"
-                    "vmul.vv v10, v10, v12\n\t"
-                    "vredsum.vs v0, v10, v0\n\t"
+                    "vmul.vx v0, v8, %[tmp]\n\t"
+                    "vmul.vx v1, v9, %[t1]\n\t"
+                    "vmacc.vx v0, %[t2], v10\n\t"
+                    "vmacc.vx v1, %[t3], v11\n\t"
+                    "vmacc.vx v0, %[t4], v12\n\t"
+                    "vmacc.vx v1, %[t5], v13\n\t"
+                    "vmacc.vx v0, %[t6], v14\n\t"
+                    "vmacc.vx v1, %[t7], v15\n\t"
                    "vmv.x.s %[tmp], v0\n\t"
-                    "add %[isum], %[isum], %[tmp]"
-                    : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
+                    "vmv.x.s %[t1], v1\n\t"
+                    "add %[isum], %[isum], %[tmp]\n\t"
+                    "add %[isum], %[isum], %[t1]"
+                    : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                    , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                    , [m] "+&r" (m), [isum] "+&r" (isum)
                    : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
                    , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
                    : "memory"
--- a/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
+++ b/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
@@ -0,0 +1,50 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__s390x__)
+#include <sys/auxv.h>
+
+// find hwcap bits in asm/elf.h
+#ifndef HWCAP_VXRS_EXT2
+#define HWCAP_VXRS_EXT2 (1 << 15)
+#endif
+
+#ifndef HWCAP_NNPA
+#define HWCAP_NNPA (1 << 20)
+#endif
+
+struct s390x_features { // runtime CPU feature flags, filled in by the constructor
+    bool has_vxe2 = false; // true when HWCAP_VXRS_EXT2 is set in AT_HWCAP
+    bool has_nnpa = false; // true when HWCAP_NNPA is set in AT_HWCAP
+
+    s390x_features() {
+        uint32_t hwcap = getauxval(AT_HWCAP); // hardware-capability word from the auxiliary vector, kept in 32 bits
+        // NOTE: use hwcap2 with DFLT for z17 and later
+        // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2); // !! collapses the masked bit to a bool
+        has_nnpa = !!(hwcap & HWCAP_NNPA);
+    }
+};
+
+static int ggml_backend_cpu_s390x_score() { // returns 0 when a compiled-in feature is missing at runtime, else a positive score
+    int score = 1; // value returned when no optional feature is compiled in
+    s390x_features sf; // constructor reads AT_HWCAP
+
+// IBM z15 / LinuxONE 3
+#ifdef GGML_USE_VXE2
+    if (!sf.has_vxe2) { return 0; } // built with GGML_USE_VXE2 but the flag is not reported at runtime
+    score += 1 << 1; // VXE2 contributes 2 to the score
+#endif
+
+// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+#ifdef GGML_USE_NNPA
+    if (!sf.has_nnpa) { return 0; } // built with GGML_USE_NNPA but the flag is not reported at runtime
+    score += 1 << 2; // NNPA contributes 4 to the score
+#endif
+
+    return score; // NOTE(review): how the caller ranks scores is not visible here
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+#endif  // __s390x__
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -646,7 +646,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
    __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
    int64_t xstart = 0;
    int anr = nr - nr%16; // Used to align nr with boundary of 16
-#ifdef __AVX512F__
+#if defined(__AVX512BW__) && defined(__AVX512DQ__)
    int anc = nc - nc%16; // Used to align nc with boundary of 16
                          // Mask to mask out nibbles from packed bytes expanded to 512 bit length
    const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -1041,7 +1041,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
        xstart = anc/8;
        y = 0;
    }
-#endif // __AVX512F__
+#endif // __AVX512BW__ && __AVX512DQ__

    // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation

@@ -1989,7 +1989,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
    int64_t xstart = 0;
    int anr = nr - nr % 16;; // Used to align nr with boundary of 16
-#ifdef __AVX512F__
+#if defined(__AVX512BW__) && defined(__AVX512DQ__)
    int anc = nc - nc % 16; // Used to align nc with boundary of 16
    // Mask to mask out nibbles from packed bytes expanded to 512 bit length
    const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -2727,7 +2727,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
        xstart = anc/8;
        y = 0;
    }
-#endif //AVX512F
+#endif // __AVX512BW__ && __AVX512DQ__

    // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
    for (; y < anr / 4; y += 4) {
@@ -3467,7 +3467,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse);
    scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0);

-#ifdef __AVX512F__
+#if defined(__AVX512BW__) && defined(__AVX512DQ__)

    int anc = nc - nc % 16; // Used to align nc with boundary of 16

@@ -4947,7 +4947,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
        y = 0;
    }

-#endif //AVX512F
+#endif // __AVX512BW__ && __AVX512DQ__

    // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
    for (; y < anr / 4; y += 4) {
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {

 #endif

-#if defined(__loongarch_asx)
+#if defined(__loongarch_sx)
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(const float val) {
    v4f32 res = {val, val, val, val};
    return (__m128)res;
 }
+#endif

+#if defined(__loongarch_asx)
 static __m256 __lasx_xvreplfr2vr_s(const float val) {
    v8f32 res = {val, val, val, val, val, val, val, val};
    return (__m256)res;
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1613,13 +1613,8 @@ static void ggml_compute_forward_mul_mat_id(
            chunk_size = 64;
        }

-#if defined(__aarch64__)
-        // disable for ARM
-        const bool disable_chunking = true;
-#else
        // disable for NUMA
        const bool disable_chunking = ggml_is_numa();
-#endif // defined(__aarch64__)

        int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
        int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
@@ -1736,6 +1731,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_sum_rows(params, tensor);
            } break;
+        case GGML_OP_CUMSUM:
+            {
+                ggml_compute_forward_cumsum(params, tensor);
+            } break;
        case GGML_OP_MEAN:
            {
                ggml_compute_forward_mean(params, tensor);
@@ -1812,22 +1811,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_cont(params, tensor);
            } break;
-        case GGML_OP_RESHAPE:
-            {
-                ggml_compute_forward_reshape(params, tensor);
-            } break;
-        case GGML_OP_VIEW:
-            {
-                ggml_compute_forward_view(params, tensor);
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                ggml_compute_forward_permute(params, tensor);
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                ggml_compute_forward_transpose(params, tensor);
-            } break;
        case GGML_OP_GET_ROWS:
            {
                ggml_compute_forward_get_rows(params, tensor);
@@ -1944,10 +1927,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_argsort(params, tensor);
            } break;
+        case GGML_OP_TOP_K:
+            {
+                ggml_compute_forward_top_k(params, tensor);
+            } break;
        case GGML_OP_LEAKY_RELU:
            {
                ggml_compute_forward_leaky_relu(params, tensor);
            } break;
+        case GGML_OP_TRI:
+            {
+                ggml_compute_forward_tri(params, tensor);
+            } break;
+        case GGML_OP_FILL:
+            {
+                ggml_compute_forward_fill(params, tensor);
+            } break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
                ggml_compute_forward_flash_attn_ext(params, tensor);
@@ -2003,6 +1998,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_rwkv_wkv7(params, tensor);
            } break;
+        case GGML_OP_SOLVE_TRI:
+            {
+                ggml_compute_forward_solve_tri(params, tensor);
+            } break;
        case GGML_OP_MAP_CUSTOM1:
            {
                ggml_compute_forward_map_custom1(params, tensor);
@@ -2047,6 +2046,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                // nop
            } break;
+        case GGML_OP_RESHAPE:
+            {
+                // nop
+            } break;
+        case GGML_OP_PERMUTE:
+            {
+                // nop
+            } break;
+        case GGML_OP_VIEW:
+            {
+                // nop
+            } break;
+        case GGML_OP_TRANSPOSE:
+            {
+                // nop
+            } break;
        case GGML_OP_COUNT:
            {
                GGML_ABORT("fatal error");
@@ -2145,6 +2160,9 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_ACC:
+        case GGML_OP_CUMSUM:
+        case GGML_OP_TRI:
+        case GGML_OP_FILL:
            {
                n_tasks = n_threads;
            } break;
@@ -2162,6 +2180,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                n_tasks = 1;
            } break;
        case GGML_OP_COUNT_EQUAL:
+        case GGML_OP_SOLVE_TRI:
            {
                n_tasks = n_threads;
            } break;
@@ -2184,6 +2203,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_SOFTPLUS:
+                case GGML_UNARY_OP_EXPM1:
+                case GGML_UNARY_OP_FLOOR:
+                case GGML_UNARY_OP_CEIL:
+                case GGML_UNARY_OP_ROUND:
+                case GGML_UNARY_OP_TRUNC:
                    {
                        n_tasks = 1;
                    } break;
@@ -2290,6 +2315,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:
+        case GGML_OP_TOP_K:
        case GGML_OP_FLASH_ATTN_EXT:
        case GGML_OP_FLASH_ATTN_BACK:
        case GGML_OP_SSM_CONV:
@@ -2813,6 +2839,10 @@ struct ggml_cplan ggml_graph_plan(
                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                        cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
                    } break;
+                case GGML_OP_TOP_K:
+                    {
+                        cur += sizeof(int32_t)*node->src[0]->ne[0]*n_tasks;
+                    } break;
                case GGML_OP_FLASH_ATTN_EXT:
                    {
                        const int64_t ne10 = node->src[1]->ne[0]; // DK
@@ -2885,6 +2915,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
        struct ggml_tensor * node = cgraph->nodes[node_n];

+        if (ggml_op_is_empty(node->op)) {
+            // skip NOPs
+            continue;
+        }
+
        ggml_compute_forward(&params, node);

        if (state->ith == 0 && cplan->abort_callback &&
@@ -3270,6 +3305,13 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
        __m128 y_vec = _mm_cvtph_ps(x_vec);
        _mm_storeu_ps(y + i, y_vec);
    }
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m1(n - i);
+        vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
+        vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
+        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+    }
 #endif

    for (; i < n; ++i) {
@@ -3563,13 +3605,17 @@ void ggml_cpu_init(void) {
 #ifdef GGML_USE_OPENMP
            //if (!getenv("OMP_WAIT_POLICY")) {
            //    // set the wait policy to active, so that OpenMP threads don't sleep
-            //    putenv("OMP_WAIT_POLICY=active");
+            //    setenv("OMP_WAIT_POLICY", "active", 0)
            //}

            if (!getenv("KMP_BLOCKTIME")) {
                // set the time to wait before sleeping a thread
                // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
-                putenv("KMP_BLOCKTIME=200"); // 200ms
+#ifdef _WIN32
+                _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+#else
+                setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+#endif
            }
 #endif
        }
--- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@@ -4,6 +4,7 @@

 // KleidiAI micro-kernels
 #include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
+#include "kai_matmul_clamp_f32_qai8dxp_qsi8cxp_interface.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
@@ -11,23 +12,34 @@
 #include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
 #include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
+#include "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
+#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
+#include "kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h"
+#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
+#include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
+#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"

 #include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
 #include "kai_lhs_quant_pack_qsi8d32p_f32.h"
 #include "kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.h"
 #include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
+#include "kai_lhs_quant_pack_qai8dxp_f32.h"

 #include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
 #include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
 #include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
+#include "kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h"

 #include "kai_common.h"

 #include "simd-mappings.h"

+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
 #include "kernels.h"

-#define NELEMS(x) sizeof(x) / sizeof(*x)
+#define NELEMS(x) (sizeof(x) / sizeof(*x)) // element count of an array; whole expansion is parenthesized. NOTE(review): *x itself is not parenthesized; pass a plain array name only

 template<size_t(*Fn)(size_t,size_t,size_t)>
 static inline size_t kernel_offs_fn3(size_t a, size_t b, size_t c) {
@@ -55,6 +67,14 @@ static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
    Fn(m, n, k, lhs, rhs, dst, dst_stride_row, dst_stride_col, clamp_min, clamp_max);
 }

+template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)> // Fn: 10-argument kernel whose dst parameter is float*
+static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/, // bl is accepted but not forwarded to Fn
+                                     const void* lhs, const void* rhs, void* dst,
+                                     size_t dst_stride_row, size_t dst_stride_col,
+                                     float clamp_min, float clamp_max) {
+    Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max); // only adaptation: untyped dst is cast to the float* Fn expects
+}
+
 template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t)>
 static inline size_t lhs_ps_fn6(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) {
    return Fn(m, k, bl, mr, kr, sr);
@@ -93,6 +113,12 @@ static inline void lhs_pack_void_fn9(size_t m, size_t k, size_t /*bl*/, size_t m
    Fn(m, k, mr, kr, sr, m_idx_start, lhs, lhs_stride, lhs_packed);
 }

+template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const float*,size_t,void*)> // Fn: 9-argument LHS packing routine taking a const float* source
+static inline void lhs_pack_float_fn9_no_bl(size_t m, size_t k, size_t /*bl*/, size_t mr, size_t kr, size_t sr, // bl is accepted but not forwarded to Fn
+                                            size_t m_idx_start, const void * lhs, size_t lhs_stride, void * lhs_packed) {
+    Fn(m, k, mr, kr, sr, m_idx_start, static_cast<const float*>(lhs), lhs_stride, lhs_packed); // untyped lhs is cast to the const float* Fn expects
+}
+
 template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t)>
 static inline size_t rhs_ps_fn5(size_t n, size_t k, size_t nr, size_t kr, size_t bl) {
    return Fn(n, k, nr, kr, bl);
@@ -124,6 +150,18 @@ static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t n
       static_cast<const kai_rhs_pack_qs4cxs1s0_param*>(params));
 }

+template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const int8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_qsi8cx_params*)> // Fn: 12-argument RHS packing routine with typed rhs/bias/scale/params
+static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/, // bl is accepted but not forwarded
+                                               size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale, // rhs_stride is likewise ignored
+                                               void* rhs_packed, size_t extra_bytes, const void* params) {
+    Fn(num_groups, n, k, nr, kr, sr,
+       static_cast<const int8_t*>(rhs), // untyped pointers are cast to the concrete types Fn declares
+       static_cast<const float*>(bias),
+       static_cast<const float*>(scale),
+       rhs_packed, extra_bytes,
+       static_cast<const kai_rhs_pack_qsi8cx_params*>(params));
+}
+
 template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const void*,const void*,const void*,void*,size_t,const void*)>
 static inline void rhs_pack_fn13(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
                                               size_t rhs_stride, const void* rhs, const void* bias, const void* scale,
@@ -213,6 +251,57 @@ static void dequantize_row_qsi4c32ps1s0scalef16(
    GGML_UNUSED(kr);
 }

+static void dequantize_row_qsi8cxp( // writes out[0..k): int8 values of one packed row converted to float and multiplied by that row's scale
+    const void *packed_data, // start of the packed buffer
+    int32_t row_idx, // row to extract; split below into a group index and a position inside the group
+    int64_t k, // number of output values written to out
+    float *out, // destination, indexed 0..k-1
+    size_t nr, // rows per packed group
+    size_t packed_row_stride, // byte distance between consecutive groups
+    size_t kr, // values per row within one interleaved block
+    size_t bl, // unused
+    size_t num_bytes_multiplier // unused
+) {
+    GGML_UNUSED(bl);
+    GGML_UNUSED(num_bytes_multiplier);
+
+    const size_t k_internal = ((size_t) k + QK8_0 - 1) / QK8_0 * QK8_0; // k rounded up to a multiple of QK8_0
+    const size_t group_idx = row_idx / nr;
+    const size_t row_in_group = row_idx % nr;
+
+    const uint8_t * group_ptr = static_cast<const uint8_t *>(packed_data) + group_idx * packed_row_stride; // first byte of this row's group
+    const int8_t  * data_base = reinterpret_cast<const int8_t *>(group_ptr); // int8 data sits at the start of the group
+
+    const size_t num_blocks = k_internal / kr; // blocks of kr values covering the padded row length
+
+    for (size_t block = 0; block < num_blocks; ++block) {
+        const int8_t * block_ptr = data_base + (block * nr + row_in_group) * kr; // each block stores kr values for each of the nr rows in turn
+        for (size_t i = 0; i < kr; ++i) {
+            const size_t k_idx = block * kr + i;
+            if (k_idx < (size_t) k) { // skip the padding between k and k_internal
+                out[k_idx] = static_cast<float>(block_ptr[i]);
+            }
+        }
+    }
+
+    const uint8_t * sums_ptr = group_ptr + nr * k_internal; // region right after the group's nr * k_internal int8 values
+    GGML_UNUSED(sums_ptr); // NOTE(review): redundant, sums_ptr is used on the next statement
+
+    const float * scale_ptr = reinterpret_cast<const float *>(sums_ptr + nr * sizeof(int32_t)); // floats located after nr int32 entries (presumably per-row sums, confirm against the packer)
+    const float scale = scale_ptr[row_in_group]; // one scale per row of the group
+
+    if (scale == 0.0f) { // zero scale: store 0.0f directly instead of multiplying
+        for (size_t i = 0; i < (size_t) k; ++i) {
+            out[i] = 0.0f;
+        }
+        return;
+    }
+
+    for (size_t i = 0; i < (size_t) k; ++i) {
+        out[i] *= scale; // apply the row scale in place to the converted values
+    }
+}
+
 static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
 #if defined(__ARM_FEATURE_SME)
    {
@@ -546,6 +635,176 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
    },
 #endif
 #endif
+    { /* Sentinel */ }
+};
+
+static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { // lhs F32, rhs Q8_0, op F32 entries; lookups scan in order and take the first entry whose required_cpu bits are all present
+#if defined(__ARM_FEATURE_SME)
+    {
+        /* SME GEMM (sme2 mopa kernel) */
+        {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+            /* .get_lhs_offset_ex     = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
+            /* .run_kernel_ex         = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
+        },
+        /* .gemm_lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
+            /* .get_packed_offset_ex  = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
+            /* .packed_size_ex        = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
+            /* .pack_func_ex          = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
+        },
+        /* SME GEMV (sme2 dot kernel) */
+        {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
+            /* .get_lhs_offset_ex     = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
+            /* .run_kernel_ex         = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
+        },
+        /* .gemv_lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
+            /* .get_packed_offset_ex  = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
+            /* .packed_size_ex        = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
+            /* .pack_func_ex          = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
+        },
+        /* .rhs_info = */ {
+            /* .packed_stride         = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
+            /* .to_float              = */ dequantize_row_qsi8cxp,
+            /* .packed_size_ex        = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+            /* .packed_stride_ex      = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+            /* .pack_func_ex          = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+        },
+        /* .required_cpu       = */ CPU_FEATURE_SME,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q8_0,
+        /* .op_type            = */ GGML_TYPE_F32,
+    },
+#endif
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    {
+        /* I8MM GEMM */
+        {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+            /* .get_lhs_offset_ex     = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
+            /* .run_kernel_ex         = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
+        },
+        /* .gemm_lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
+            /* .get_packed_offset_ex  = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
+            /* .packed_size_ex        = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
+            /* .pack_func_ex          = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
+        },
+        /* I8MM GEMV (dotprod fallback) */
+        {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
+            /* .get_lhs_offset_ex     = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
+            /* .run_kernel_ex         = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
+        },
+        /* .gemv_lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
+            /* .get_packed_offset_ex  = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
+            /* .packed_size_ex        = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
+            /* .pack_func_ex          = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
+        },
+        /* .rhs_info = */ {
+            /* .packed_stride         = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
+            /* .to_float              = */ dequantize_row_qsi8cxp,
+            /* .packed_size_ex        = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+            /* .packed_stride_ex      = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+            /* .pack_func_ex          = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+        },
+        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q8_0,
+        /* .op_type            = */ GGML_TYPE_F32,
+    },
+#endif
+#if defined(__ARM_FEATURE_DOTPROD)
+    {
+        /* DOTPROD GEMM */
+        {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
+            /* .get_lhs_offset_ex     = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
+            /* .run_kernel_ex         = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
+        },
+        /* .gemm_lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
+            /* .get_packed_offset_ex  = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
+            /* .packed_size_ex        = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
+            /* .pack_func_ex          = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
+        },
+        /* DOTPROD GEMV */
+        {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
+            /* .get_lhs_offset_ex     = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
+            /* .run_kernel_ex         = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
+        },
+        /* .gemv_lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
+            /* .get_packed_offset_ex  = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
+            /* .packed_size_ex        = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
+            /* .pack_func_ex          = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
+        },
+        /* .rhs_info = */ {
+            /* .packed_stride         = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
+            /* .to_float              = */ dequantize_row_qsi8cxp,
+            /* .packed_size_ex        = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+            /* .packed_stride_ex      = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+            /* .pack_func_ex          = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
+        },
+        /* .required_cpu       = */ CPU_FEATURE_DOTPROD,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q8_0,
+        /* .op_type            = */ GGML_TYPE_F32,
+    },
+#endif
+    { /* Sentinel: empty terminator entry; the lookup loops stop at NELEMS - 1 so it is never matched (presumably it also keeps the array non-empty when no feature macro is defined) */ }
 };

 ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
@@ -553,7 +812,7 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c

    if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
 #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
-        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
            if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
                gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
                gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
@@ -562,6 +821,21 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
                break;
            }
        }
+        if (!kernel) {
+            for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
+                if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
+                    gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
+                    gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
+                    gemm_gemv_kernels_q8[i].op_type  == tensor->type) {
+                    kernel = &gemm_gemv_kernels_q8[i];
+                    break;
+                }
+            }
+        }
+#else
+    GGML_UNUSED(gemm_gemv_kernels);
+    GGML_UNUSED(gemm_gemv_kernels_q8);
+    GGML_UNUSED(cpu_features);
 #endif
    }

@@ -572,12 +846,31 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features)
    ggml_kleidiai_kernels * kernels = nullptr;

 #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
-    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
        if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
            kernels = &gemm_gemv_kernels[i];
            break;
        }
    }
+#else
+    GGML_UNUSED(features);
+#endif
+
+    return kernels;
+}
+
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features) {
+    ggml_kleidiai_kernels * kernels = nullptr;
+
+#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
+    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
+        if ((features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu) {
+            kernels = &gemm_gemv_kernels_q8[i];
+            break;
+        }
+    }
+#else
+    GGML_UNUSED(features);
 #endif

    return kernels;
--- a/ggml/src/ggml-cpu/kleidiai/kernels.h
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.h
@@ -87,3 +87,4 @@ struct ggml_kleidiai_kernels {

 ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
 ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features);
--- a/Show More
+++ b/Show More