server: streaming of tool calls and thoughts when --jinja is on (#12379 )

* add common_json w/ support for truncated json healing * add common_chat_msg_diff * partial common_chat_parse * refactor parser w/ optionals * server: wire chat diffs in stream mode * fix trigger of thinking models (must happen after thoughts are closed) * fix functionary v3.2 raw python! * rename: common_chat_syntax (now contains format) * rm common_regex.at_start * don't return empty <think></think> * accommodate yet another deepseek r1 distill fantasy syntax (`<｜tool▁calls｜>`) * fix QwQ 32B tool call parsing after thoughts (hermes2) * better logs for grammar triggers * consume spaces after parse_json_tool_calls * fix required tool calls w/ thinking models that have pre-opened thinking tags * fix thinking model's initial trigger + test qwq's template * run most test_tool_call tests in stream + non-stream modes * make functionary v3.2 parsing more strict (differentiate first match from others) * send final diff from server, to close off raw python arguments * support partial content streaming in Generic mode * tool-call: allow content prelude before hermes2 tool calls (for Qwen2.5) * Update function-calling.md * Update tool_bench.py * chat-parser: remove input from exception (llm output may contain PII) --------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Olivier Chafik <ochafik@users.noreply.github.com>
releases : bundle llvm omp library in windows release (#13763 )
2026-02-05 13:53:23 +02:00 · 2025-05-25 01:48:08 +01:00 · 2025-05-25 00:55:16 +02:00 · 2025-05-24 22:27:03 +02:00 · 2025-05-24 22:26:47 +02:00 · 2025-05-24 16:49:12 +02:00
213 changed files with 110116 additions and 7650 deletions
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04

 ## Build Image

--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

@@ -21,21 +21,14 @@ RUN apt-get update && \
    libcurl4-openssl-dev \
    libgomp1

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
 WORKDIR /app

 COPY . .

-# Use the default MUSA archs if not specified
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
--- a/.editorconfig
+++ b/.editorconfig
@@ -48,3 +48,7 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[tools/mtmd/miniaudio.h]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
--- a/.github/actions/windows-setup-curl/action.yml
+++ b/.github/actions/windows-setup-curl/action.yml
@@ -5,6 +5,10 @@ inputs:
    description: 'CURL version'
    required: false
    default: '8.6.0_6'
+  architecture:
+    description: 'Architecture of the libcurl to download'
+    required: false
+    default: 'win64'
 outputs:
  curl_path:
    description: "Path to the downloaded libcurl"
@@ -18,8 +22,9 @@ runs:
      shell: powershell
      env:
        CURL_VERSION: ${{ inputs.curl_version }}
+        ARCHITECTURE: ${{ inputs.architecture }}
      run: |
-        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
        mkdir $env:RUNNER_TEMP/libcurl
        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -140,3 +140,94 @@ jobs:
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-cpu-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-powerpc64le-linux-gnu \
+                  g++-14-powerpc64le-linux-gnu \
+                  libcurl4-openssl-dev:ppc64el
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-vulkan-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-powerpc64le-linux-gnu \
+                  g++-14-powerpc64le-linux-gnu \
+                  libvulkan-dev:ppc64el \
+                  libcurl4-openssl-dev:ppc64el
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_VULKAN=ON \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -351,7 +351,7 @@ jobs:

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04

    steps:
      - name: Clone
@@ -899,7 +899,7 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,4 +1,4 @@
-name: Create Release
+name: Release

 on:
  workflow_dispatch: # allows manual triggering
@@ -227,6 +227,69 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
          name: llama-bin-ubuntu-vulkan-x64.zip

+  windows-cpu:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        include:
+          - arch: 'x64'
+          - arch: 'arm64'
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-cpu-${{ matrix.arch }}
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Install Ninja
+        run: |
+          choco install ninja
+
+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
+
+      - name: Build
+        shell: cmd
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
+            -DGGML_OPENMP=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bin-win-cpu-${{ matrix.arch }}.zip
+          name: llama-bin-win-cpu-${{ matrix.arch }}.zip
+
  windows:
    runs-on: windows-latest

@@ -237,47 +300,30 @@ jobs:
    strategy:
      matrix:
        include:
-          - build: 'cpu-x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
-          #- build: 'openblas-x64'
-          #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
-          - build: 'cpu-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
-          - build: 'opencl-adreno-arm64'
+          - backend: 'vulkan'
+            arch: 'x64'
+            defines: '-DGGML_VULKAN=ON'
+            target: 'ggml-vulkan'
+          - backend: 'opencl-adreno'
+            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
+            target: 'ggml-opencl'

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-${{ matrix.build }}
+          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d

-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
      - name: Install Vulkan SDK
        id: get_vulkan
-        if: ${{ matrix.build == 'vulkan-x64' }}
+        if: ${{ matrix.backend == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@@ -291,7 +337,7 @@ jobs:

      - name: Install OpenCL Headers and Libs
        id: install_opencl
-        if: ${{ matrix.build == 'opencl-adreno-arm64' }}
+        if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
        run: |
          git clone https://github.com/KhronosGroup/OpenCL-Headers
          cd OpenCL-Headers
@@ -309,44 +355,22 @@ jobs:
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
          cmake --build build-arm64-release --target install --config release

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -S . -B build ${{ matrix.defines }} `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
+          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
+          cmake --build build --config Release --target ${{ matrix.target }}

      - name: Pack artifacts
        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
+          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-          name: llama-bin-win-${{ matrix.build }}.zip
+          path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
+          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
    runs-on: windows-2019
@@ -359,8 +383,6 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0

      - name: Install ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
@@ -379,45 +401,30 @@ jobs:
        run: |
          choco install ninja

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
        shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
-            -DGGML_CPU_ALL_VARIANTS=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CPU=OFF ^
            -DGGML_CUDA=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
-            ${{ env.CMAKE_ARGS }}
+            -DLLAMA_CURL=OFF
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
-          cmake --build build --config Release
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
+          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda

      - name: Pack artifacts
        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
        run: |
@@ -425,13 +432,13 @@ jobs:
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
+          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
        uses: actions/upload-artifact@v4
        with:
-          path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
    runs-on: windows-latest
@@ -441,15 +448,14 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
@@ -462,15 +468,18 @@ jobs:
        run:  |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

-      # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
      - name: Build
        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
+        shell: cmd
+        run: |
+          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+          cmake -G "Ninja" -B build ^
+            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
+            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
+            -DLLAMA_CURL=OFF
+          cmake --build build --target ggml-sycl -j

      - name: Build the release package
        id: pack_artifacts
@@ -495,12 +504,12 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin

          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
+          7z a llama-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload the release package
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

  windows-hip:
@@ -508,14 +517,14 @@ jobs:

    strategy:
      matrix:
-        gpu_target: [gfx1100, gfx1101, gfx1030]
+        include:
+          - name: "radeon"
+            gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0

      - name: Clone rocWMMA repository
        id: clone_rocwmma
@@ -525,7 +534,7 @@ jobs:
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-hip-release
+          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
          evict-old-files: 1d

      - name: Install
@@ -543,50 +552,39 @@ jobs:
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
            -DCMAKE_BUILD_TYPE=Release `
-            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
+            -DGGML_BACKEND_DL=ON `
+            -DGGML_NATIVE=OFF `
+            -DGGML_CPU=OFF `
+            -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+            -DLLAMA_CURL=OFF
+          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"

-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
      - name: Pack artifacts
        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
+          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode-build:
    runs-on: macos-latest
@@ -648,14 +646,16 @@ jobs:
    runs-on: ubuntu-latest

    needs:
-      - ubuntu-22-cpu
-      - ubuntu-22-vulkan
      - windows
+      - windows-cpu
      - windows-cuda
      - windows-sycl
      - windows-hip
+      - ubuntu-22-cpu
+      - ubuntu-22-vulkan
      - macOS-arm64
      - macOS-x64
+      - ios-xcode-build

    steps:
      - name: Clone
@@ -673,10 +673,43 @@ jobs:
        uses: actions/download-artifact@v4
        with:
          path: ./artifact
+          merge-multiple: true

      - name: Move artifacts
        id: move_artifacts
-        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
+        run: |
+          mkdir -p release
+
+          echo "Adding CPU backend files to existing zips..."
+          for arch in x64 arm64; do
+            cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
+            temp_dir=$(mktemp -d)
+            echo "Extracting CPU backend for $arch..."
+            unzip "$cpu_zip" -d "$temp_dir"
+
+            echo "Adding CPU files to $arch zips..."
+            for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
+              if [[ "$target_zip" == "$cpu_zip" ]]; then
+                continue
+              fi
+              echo "Adding CPU backend to $(basename "$target_zip")"
+              realpath_target_zip=$(realpath "$target_zip")
+              (cd "$temp_dir" && zip -r "$realpath_target_zip" .)
+            done
+
+            rm -rf "$temp_dir"
+          done
+
+          echo "Renaming and moving zips to release..."
+          for zip_file in artifact/llama-bin-win-*.zip; do
+            base_name=$(basename "$zip_file" .zip)
+            zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
+            echo "Moving $zip_file to release/$zip_name"
+            mv "$zip_file" "release/$zip_name"
+          done
+
+          echo "Moving other artifacts..."
+          mv -v artifact/*.zip release

      - name: Create release
        id: create_release
@@ -695,7 +728,7 @@ jobs:
            const path = require('path');
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact/release')) {
+            for (let file of await fs.readdirSync('./release')) {
              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
@@ -703,7 +736,7 @@ jobs:
                  repo: context.repo.repo,
                  release_id: release_id,
                  name: file,
-                  data: await fs.readFileSync(`./artifact/release/${file}`)
+                  data: await fs.readFileSync(`./release/${file}`)
                });
              }
            }
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -0,0 +1,42 @@
+name: Update Winget Package
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    - cron: '28 5 * * *' # Update every day at 5:28 UTC
+
+jobs:
+  update:
+    name: Update Winget Package
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install cargo binstall
+        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
+
+      - name: Install komac
+        run: |
+          cargo binstall komac@2.11.2 -y
+
+      - name: Find latest release
+        id: find_latest_release
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const { data: releases } = await github.rest.repos.listReleases({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+            });
+            console.log("Latest release:", releases[0].tag_name);
+            return releases[0].tag_name;
+
+      - name: Update manifest
+        env:
+          VERSION: ${{ steps.find_latest_release.outputs.result }}
+        run: |
+          echo "Updating manifest..."
+          komac update --version ${{ env.VERSION }} \
+            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
+            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
+            --submit \
+            ggml.llamacpp
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
@@ -572,4 +572,12 @@ automatically. For example:
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```

-## References
+## Dependencies
+
+- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
+- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
+- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -117,6 +117,7 @@ setup_framework_structure() {
    # Copy all required headers (common for all platforms)
    cp include/llama.h             ${header_path}
    cp ggml/include/ggml.h         ${header_path}
+    cp ggml/include/ggml-opt.h     ${header_path}
    cp ggml/include/ggml-alloc.h   ${header_path}
    cp ggml/include/ggml-backend.h ${header_path}
    cp ggml/include/ggml-metal.h   ${header_path}
--- a/ci/README.md
+++ b/ci/README.md
@@ -54,7 +54,7 @@ docker run --privileged -it \
    -v $HOME/llama.cpp/ci-cache:/ci-cache \
    -v $HOME/llama.cpp/ci-results:/ci-results \
    -v $PWD:/ws -w /ws \
-    mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
 ```

 Inside the container, execute the following commands:
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -60,12 +60,16 @@ add_library(${TARGET} STATIC
    base64.hpp
    chat.cpp
    chat.h
+    chat-parser.cpp
+    chat-parser.h
    common.cpp
    common.h
    console.cpp
    console.h
    json-schema-to-grammar.cpp
    json.hpp
+    json-partial.h
+    json-partial.cpp
    llguidance.cpp
    log.cpp
    log.h
@@ -73,6 +77,8 @@ add_library(${TARGET} STATIC
    minja/minja.hpp
    ngram-cache.cpp
    ngram-cache.h
+    regex-partial.cpp
+    regex-partial.h
    sampling.cpp
    sampling.h
    speculative.cpp
@@ -119,8 +125,8 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.19 (+ fancy-regex build fix):
-        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -39,7 +39,7 @@
 using json = nlohmann::ordered_json;

 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_SERVER,
 };

@@ -1445,6 +1445,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_keep = value;
        }
    ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1670,7 +1678,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
@@ -2057,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.grp_attn_w = value;
        }
    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
    add_opt(common_arg(
        {"-nkvo", "--no-kv-offload"},
        "disable KV offload",
@@ -2232,12 +2233,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
        [](common_params & params, const std::string & value) {
            params.image.emplace_back(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
@@ -2437,6 +2438,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
    add_opt(common_arg(
        {"--lora"}, "FNAME",
        "path to LoRA adapter (can be repeated to use multiple adapters)",
@@ -2578,7 +2586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.n_junk = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"--pos"}, "N",
        string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2641,7 +2649,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.is_pp_shared = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"-npp"}, "n0,n1,...",
        "number of prompt tokens",
@@ -2860,7 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.chat_template = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
    add_opt(common_arg(
        {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
        string_format(
@@ -2873,6 +2881,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.chat_template = read_file(value);
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
    add_opt(common_arg(
        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -0,0 +1,376 @@
+#include "chat-parser.h"
+#include "common.h"
+#include "log.h"
+#include "regex-partial.h"
+
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
+    : input_(input), is_partial_(is_partial), syntax_(syntax)
+{
+    result_.role = "assistant";
+
+    while (true) {
+        std::string id = std::to_string(std::rand());
+        if (input.find(id) == std::string::npos) {
+            healing_marker_ = id;
+            break;
+        }
+    }
+}
+
+std::string common_chat_msg_parser::str(const common_string_range & rng) const {
+    GGML_ASSERT(rng.begin <= rng.end);
+    return input_.substr(rng.begin, rng.end - rng.begin);
+}
+
+void common_chat_msg_parser::add_content(const std::string &content) {
+    result_.content += content;
+}
+
+void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
+    result_.reasoning_content += reasoning_content;
+}
+
+bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
+    if (name.empty()) {
+        return false;
+    }
+
+    common_chat_tool_call tool_call;
+    tool_call.name = name;
+    tool_call.arguments = arguments;
+    tool_call.id = id;
+
+    // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
+    result_.tool_calls.emplace_back(tool_call);
+    return true;
+}
+bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
+    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
+    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
+    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
+    return add_tool_call(name, id, arguments);
+}
+
+bool common_chat_msg_parser::add_tool_calls(const json & arr) {
+    for (const auto & item : arr) {
+        if (!add_tool_call(item)) {
+            return false;
+        }
+    }
+    return true;
+}
+void common_chat_msg_parser::finish() {
+    if (!is_partial_ && pos_ != input_.size()) {
+        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
+    }
+}
+
+bool common_chat_msg_parser::consume_spaces() {
+    const auto length = input_.size();
+    auto consumed = false;
+    while (pos_ < length && std::isspace(input_[pos_])) {
+        ++pos_;
+        consumed = true;
+    }
+    return consumed;
+}
+
+bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
+    auto pos = pos_;
+    for (auto i = 0u; i < literal.size(); ++i) {
+        if (pos >= input_.size()) {
+            return false;
+        }
+        if (input_[pos] != literal[i]) {
+            return false;
+        }
+        ++pos;
+    }
+    pos_ = pos;
+    return true;
+}
+
+std::optional<common_chat_msg_parser::find_regex_result>  common_chat_msg_parser::try_find_literal(const std::string & literal) {
+    auto idx = input_.find(literal, pos_);
+    if (idx != std::string::npos) {
+        find_regex_result res;
+        res.prelude = input_.substr(pos_, idx - pos_);
+        auto end = idx + literal.size();
+        res.groups.emplace_back(common_string_range{idx, end});
+        move_to(end);
+        return res;
+    }
+    if (is_partial_) {
+        idx = string_find_partial_stop(input_, literal);
+        if (idx != std::string::npos && idx >= pos_) {
+            find_regex_result res;
+            res.prelude = input_.substr(pos_, idx - pos_);
+            auto end = input_.size();
+            res.groups.emplace_back(common_string_range{idx, end});
+            move_to(end);
+            return res;
+        }
+    }
+    return std::nullopt;
+}
+
+void common_chat_msg_parser::consume_literal(const std::string & literal) {
+    if (!try_consume_literal(literal)) {
+        throw common_chat_msg_partial_exception(literal);
+    }
+}
+
+bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
+        auto stripped_reasoning = string_strip(reasoning);
+        if (stripped_reasoning.empty()) {
+            return;
+        }
+        if (syntax_.reasoning_in_content) {
+            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
+            add_content(stripped_reasoning);
+            if (closed) {
+                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
+            }
+        } else {
+            add_reasoning_content(stripped_reasoning);
+        }
+    };
+    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
+        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
+            if (auto res = try_find_literal(end_think)) {
+                handle_reasoning(res->prelude, /* closed */ true);
+                consume_spaces();
+                return true;
+            }
+            auto rest = consume_rest();
+            if (!rest.empty()) {
+                handle_reasoning(rest, /* closed */ !is_partial());
+            }
+            if (!syntax_.thinking_forced_open) {
+                throw common_chat_msg_partial_exception(end_think);
+            }
+            return true;
+        }
+    }
+    return false;
+}
+
+std::string common_chat_msg_parser::consume_rest() {
+    auto rest = input_.substr(pos_);
+    pos_ = input_.size();
+    return rest;
+}
+
+// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from) {
+    auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
+    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+        return std::nullopt;
+    }
+    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+        if (is_partial()) {
+            throw common_chat_msg_partial_exception(regex.str());
+        }
+        return std::nullopt;
+    }
+    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
+    pos_ = m.groups[0].end;
+
+    return find_regex_result{prelude, m.groups};
+}
+
+common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
+    if (auto result = try_consume_regex(regex)) {
+        return *result;
+    }
+    throw common_chat_msg_partial_exception(regex.str());
+}
+
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
+    auto m = regex.search(input_, pos_);
+    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+        return std::nullopt;
+    }
+    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+        if (is_partial()) {
+            throw common_chat_msg_partial_exception(regex.str());
+        }
+        return std::nullopt;
+    }
+    if (m.groups[0].begin != pos_) {
+        // Didn't match at the current position.
+        return std::nullopt;
+    }
+    pos_ = m.groups[0].end;
+
+    return find_regex_result {
+        /* .prelude = */ "",
+        m.groups,
+    };
+}
+
+std::optional<common_json> common_chat_msg_parser::try_consume_json() {
+    auto it = input_.cbegin() + pos_;
+    const auto end = input_.cend();
+    common_json result;
+    if (!common_json_parse(it, end, healing_marker_, result)) {
+        return std::nullopt;
+    }
+    pos_ = std::distance(input_.cbegin(), it);
+    if (result.healing_marker.marker.empty()) {
+        // No healing marker, just return the parsed json
+        return result;
+    }
+    if (!is_partial()) {
+        throw common_chat_msg_partial_exception("JSON");
+    }
+    return result;
+}
+
+common_json common_chat_msg_parser::consume_json() {
+    if (auto result = try_consume_json()) {
+        return *result;
+    }
+    throw common_chat_msg_partial_exception("JSON");
+}
+
+common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
+    const std::vector<std::vector<std::string>> & args_paths,
+    const std::vector<std::vector<std::string>> & content_paths
+) {
+    if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
+        return *result;
+    }
+    throw common_chat_msg_partial_exception("JSON");
+}
+
+std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
+    const std::vector<std::vector<std::string>> & args_paths,
+    const std::vector<std::vector<std::string>> & content_paths
+) {
+    auto partial = try_consume_json();
+    if (!partial) {
+        return std::nullopt;
+    }
+    auto is_arguments_path = [&](const std::vector<std::string> & path) {
+        return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
+    };
+    auto is_content_path = [&](const std::vector<std::string> & path) {
+        return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
+    };
+
+    if (partial->healing_marker.marker.empty()) {
+        if (args_paths.empty()) {
+            // No arguments to dump, and JSON was parsed fully.
+            return consume_json_result {
+                partial->json,
+                /* .is_partial = */ false,
+            };
+        }
+        if (is_arguments_path({})) {
+            // Entire JSON is the arguments and was parsed fully.
+            return consume_json_result {
+                partial->json.dump(),
+                /* .is_partial = */ false,
+            };
+        }
+    }
+
+    LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
+
+    auto found_healing_marker = false;
+    std::vector<std::string> path;
+    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
+        if (is_arguments_path(path)) {
+            auto arguments = j.dump();
+            if (is_partial() && !partial->healing_marker.marker.empty()) {
+                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
+                if (idx != std::string::npos) {
+                    arguments.resize(idx);
+                    found_healing_marker = true;
+                }
+                if (arguments == "\"") {
+                    // This happens because of completing `:"$magic` after `"arguments"`
+                    arguments = "";
+                }
+            }
+            return arguments;
+        }
+        if (is_content_path(path)) {
+            if (!j.is_string()) {
+                throw std::runtime_error("Content path must be a string");
+            }
+            std::string str = j;
+            auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
+            if (idx != std::string::npos) {
+                str.resize(idx);
+                found_healing_marker = true;
+            }
+            return str;
+        }
+        if (j.is_object()) {
+            auto obj = json::object();
+            for (const auto & p : j.items()) {
+                const auto & key = p.key();
+                const auto & value = p.value();
+                const std::string key_str = key; // NOLINT
+                auto idx = key_str.find(healing_marker_);
+                if (idx != std::string::npos) {
+                    found_healing_marker = true;
+                    break;
+                }
+                path.push_back(key_str);
+                if (value.is_string()) {
+                    const std::string value_str = value;
+                    if (value_str.find(healing_marker_) != std::string::npos) {
+                        found_healing_marker = true;
+                        if (is_content_path(path)) {
+                            if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
+                                // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
+                                obj[key] = remove_unsupported_healings_and_dump_args(value);
+                            }
+                        }
+                        break;
+                    }
+                    obj[key] = value;
+                } else {
+                    obj[key] = remove_unsupported_healings_and_dump_args(value);
+                }
+                path.pop_back();
+            }
+            return obj;
+        }
+        if (j.is_array()) {
+            auto arr = json::array();
+            for (const auto & value : j) {
+                if (value.is_string()) {
+                    std::string str = value;
+                    auto idx = str.find(healing_marker_);
+                    if (idx != std::string::npos) {
+                        // Don't heal array values that aren't in the arguments.
+                        found_healing_marker = true;
+                        break;
+                    }
+                }
+                arr.push_back(remove_unsupported_healings_and_dump_args(value));
+            }
+            return arr;
+        }
+        return j;
+    };
+
+    auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
+    LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
+    return consume_json_result {
+        cleaned,
+        /* .is_partial = */ found_healing_marker,
+    };
+}
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include "chat.h"
+#include "json-partial.h"
+#include "json.hpp"
+#include "regex-partial.h"
+
+#include <optional>
+#include <string>
+#include <vector>
+
+class common_chat_msg_partial_exception : public std::runtime_error {
+  public:
+    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
+};
+
+class common_chat_msg_parser {
+    std::string input_;
+    bool is_partial_;
+    common_chat_syntax syntax_;
+    std::string healing_marker_;
+
+    size_t pos_ = 0;
+    common_chat_msg result_;
+
+  public:
+    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+    const std::string & input() const { return input_; }
+    size_t pos() const { return pos_; }
+    const std::string & healing_marker() const { return healing_marker_; }
+    const bool & is_partial() const { return is_partial_; }
+    const common_chat_msg & result() const { return result_; }
+
+    void move_to(size_t pos) {
+        if (pos > input_.size()) {
+            throw std::runtime_error("Invalid position!");
+        }
+        pos_ = pos;
+    }
+    void move_back(size_t n) {
+        if (pos_ < n) {
+            throw std::runtime_error("Can't move back that far!");
+        }
+        pos_ -= n;
+    }
+
+    // Get the substring of the input at the given range
+    std::string str(const common_string_range & rng) const;
+
+    // Appends to the result.content field
+    void add_content(const std::string & content);
+
+    // Appends to the result.reasoning_content field
+    void add_reasoning_content(const std::string & reasoning_content);
+
+    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
+    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
+
+    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
+    bool add_tool_call(const nlohmann::ordered_json & tool_call);
+
+    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
+    bool add_tool_calls(const nlohmann::ordered_json & arr);
+
+    void finish();
+
+    bool consume_spaces();
+
+    void consume_literal(const std::string & literal);
+
+    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
+
+    std::string consume_rest();
+
+    struct find_regex_result {
+        std::string prelude;
+        std::vector<common_string_range> groups;
+    };
+
+    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos);
+
+    bool try_consume_literal(const std::string & literal);
+
+    std::optional<find_regex_result> try_find_literal(const std::string & literal);
+
+    find_regex_result consume_regex(const common_regex & regex);
+
+    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
+
+    std::optional<common_json> try_consume_json();
+    common_json consume_json();
+
+    struct consume_json_result {
+        nlohmann::ordered_json value;
+        bool is_partial;
+    };
+
+    /*
+        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
+
+        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
+        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
+
+        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
+        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
+        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
+    */
+    consume_json_result consume_json_with_dumped_args(
+        const std::vector<std::vector<std::string>> & args_paths = {},
+        const std::vector<std::vector<std::string>> & content_paths = {}
+    );
+    std::optional<consume_json_result> try_consume_json_with_dumped_args(
+        const std::vector<std::vector<std::string>> & args_paths = {},
+        const std::vector<std::vector<std::string>> & content_paths = {}
+    );
+};
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -3,6 +3,8 @@
 #pragma once

 #include "common.h"
+#include <functional>
+#include <chrono>
 #include <string>
 #include <vector>

@@ -12,11 +14,19 @@ struct common_chat_tool_call {
    std::string name;
    std::string arguments;
    std::string id;
+
+    bool operator==(const common_chat_tool_call & other) const {
+        return name == other.name && arguments == other.arguments && id == other.id;
+    }
 };

 struct common_chat_msg_content_part {
    std::string type;
    std::string text;
+
+    bool operator==(const common_chat_msg_content_part & other) const {
+        return type == other.type && text == other.text;
+    }
 };

 struct common_chat_msg {
@@ -27,6 +37,51 @@ struct common_chat_msg {
    std::string reasoning_content;
    std::string tool_name;
    std::string tool_call_id;
+
+    template <class T> T to_json_oaicompat() const;
+
+    bool empty() const {
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+    }
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+        for (auto i = 0u; i < tool_calls.size(); i++) {
+            if (ids_cache.size() <= i) {
+                auto id = tool_calls[i].id;
+                if (id.empty()) {
+                    id = gen_tool_call_id();
+                }
+                ids_cache.push_back(id);
+            }
+            tool_calls[i].id = ids_cache[i];
+        }
+    }
+    bool operator==(const common_chat_msg & other) const {
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
+    }
+};
+
+struct common_chat_msg_diff {
+    // std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+        && tool_call_index == other.tool_call_index
+        && tool_call_delta == other.tool_call_delta;
+    }
 };

 struct common_chat_tool {
@@ -48,14 +103,11 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -70,7 +122,8 @@ struct common_chat_templates_inputs {
    std::vector<common_chat_tool> tools;
    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
    bool parallel_tool_calls = false;
-    bool extract_reasoning     = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };

 struct common_chat_params {
@@ -78,11 +131,20 @@ struct common_chat_params {
    std::string                         prompt;
    std::string                         grammar;
    bool                                grammar_lazy = false;
+    bool                                thinking_forced_open = false;
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
 };

+struct common_chat_syntax {
+    common_chat_format       format                = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE;
+    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
+    bool                     reasoning_in_content  = false;
+    bool                     thinking_forced_open  = false;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);

@@ -120,7 +182,7 @@ std::string common_chat_format_example(
    bool use_jinja);

 std::string               common_chat_format_name(common_chat_format format);
-common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
+common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

@@ -133,3 +195,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+
+template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -443,6 +443,25 @@ void string_replace_all(std::string & s, const std::string & search, const std::
    s = std::move(builder);
 }

+bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
+    if (!str.empty() && !stop.empty()) {
+        const char text_last_char = str.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
+                const auto current_partial = stop.substr(0, char_index + 1);
+                if (string_ends_with(str, current_partial)) {
+                    return str.size() - char_index - 1;
+                }
+            }
+        }
+    }
+
+    return std::string::npos;
+}
+
 std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    return std::regex_replace(s, special_chars, "\\$0");
@@ -1083,6 +1102,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

+    mparams.progress_callback           = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
    return mparams;
 }

@@ -1113,6 +1135,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.offload_kqv       = !params.no_kv_offload;
    cparams.flash_attn        = params.flash_attn;
    cparams.no_perf           = params.no_perf;
+    cparams.op_offload        = !params.no_op_offload;
+    cparams.swa_full          = params.swa_full;

    if (params.reranking) {
        cparams.embeddings    = true;
@@ -1305,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
    return text;
 }

-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
@@ -1564,3 +1513,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

    return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
--- a/common/common.h
+++ b/common/common.h
@@ -6,6 +6,7 @@

 #include <set>
 #include <string>
+#include <string_view>
 #include <vector>
 #include <sstream>

@@ -75,7 +76,7 @@ enum llama_example {
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
@@ -114,7 +115,7 @@ enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };

 struct common_grammar_trigger {
@@ -322,16 +323,17 @@ struct common_params {
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
    bool ctx_shift         = true;  // context shift on inifinite text generation
+    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
-    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
+    bool no_op_offload     = false; // globally disable offload host tensor operations to device

    bool single_turn       = false; // single turn chat conversation

@@ -366,6 +368,7 @@ struct common_params {
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

    std::vector<std::string> api_keys;

@@ -425,6 +428,11 @@ struct common_params {

    // common params
    std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void *                  load_progress_callback_user_data = NULL;
 };

 // call once at the start of a program if it uses libcommon
@@ -502,10 +510,9 @@ static bool string_starts_with(const std::string & str,
    return str.rfind(prefix, 0) == 0;
 }

-static bool string_ends_with(const std::string & str,
-                               const std::string & suffix) {  // While we wait for C++20's std::string::ends_with...
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
+// While we wait for C++20's std::string::ends_with...
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@@ -614,16 +621,6 @@ std::string common_detokenize(
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
@@ -665,3 +662,9 @@ const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
--- a/common/json-partial.cpp
+++ b/common/json-partial.cpp
@@ -0,0 +1,255 @@
+#include <json-partial.h>
+#include "ggml.h"
+#include "log.h"
+#include <string>
+
+#include <json.hpp>
+
+using json = nlohmann::ordered_json;
+
+enum common_json_stack_element_type {
+    COMMON_JSON_STACK_ELEMENT_OBJECT,
+    COMMON_JSON_STACK_ELEMENT_KEY,
+    COMMON_JSON_STACK_ELEMENT_ARRAY,
+};
+
+struct common_json_stack_element {
+    common_json_stack_element_type type;
+    std::string key;
+};
+
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out)
+{
+    std::string::const_iterator it = input.begin();
+    const auto end = input.end();
+    return common_json_parse(it, end, healing_marker, out);
+}
+
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out)
+{
+    // // https://json.nlohmann.me/features/parsing/sax_interface/
+    struct json_error_locator : public nlohmann::json_sax<json> {
+        std::size_t position;
+        bool found_error;
+        std::string last_token;
+        std::string exception_message;
+        std::vector<common_json_stack_element> stack;
+
+        json_error_locator() : position(0), found_error(false) {}
+
+        bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
+            this->position = position - 1;
+            this->found_error = true;
+            this->last_token = last_token;
+            this->exception_message = ex.what();
+            return false;
+        }
+        void close_value() {
+            if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
+                stack.pop_back();
+            }
+        }
+        bool null() override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool boolean(bool) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_integer(number_integer_t) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_unsigned(number_unsigned_t) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_float(number_float_t, const string_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool string(string_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool binary(binary_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool start_object(std::size_t) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
+            return true;
+        }
+        bool end_object() override {
+            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
+            stack.pop_back();
+            close_value();
+            return true;
+        }
+        bool key(string_t & key) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
+            return true;
+        }
+        bool start_array(std::size_t) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
+            return true;
+        }
+        bool end_array() override {
+            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
+            stack.pop_back();
+            close_value();
+            return true;
+        }
+    };
+    json_error_locator err_loc;
+    auto start = it;
+    json::sax_parse(it, end, &err_loc);
+
+    if (err_loc.found_error) {
+        it = start;
+        auto temptative_end = it + err_loc.position;
+        // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
+
+        auto input = std::string(it, temptative_end);
+        try {
+            out.json = json::parse(input);
+            // out.json = json::parse(it, temptative_end);
+            it = temptative_end;
+            return true;
+        } catch (const std::exception & ex) {
+            // No, needs healing.
+            LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
+        }
+        auto can_parse = [](const std::string & str) {
+            try {
+                auto _ = json::parse(str); // NOLINT
+                return true;
+            } catch (const std::exception &) {
+                return false;
+            }
+        };
+        if (!healing_marker.empty() && !err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
+            if (last_non_sp_pos == std::string::npos) {
+                throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
+            }
+            auto last_non_sp_char = str[last_non_sp_pos];
+            // Used to detect stops on a number, which may not be complete.
+            auto was_maybe_number = [&]() {
+                if (!str.empty() && std::isspace(str.back())) {
+                    return false;
+                }
+                return std::isdigit(last_non_sp_char) ||
+                    last_non_sp_char == '.' ||
+                    last_non_sp_char == 'e' ||
+                    last_non_sp_char == 'E' ||
+                    last_non_sp_char == '-';
+            };
+
+            std::string closing;
+            for (size_t i = err_loc.stack.size(); i > 0; i--) {
+                auto & el = err_loc.stack[i - 1];
+                if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
+                    closing += "}";
+                } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
+                    closing += "]";
+                } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
+                    throw std::runtime_error("Unexpected stack element type");
+                }
+            }
+
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
+
+            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
+                // We're inside an object value
+                if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
+                    // Was about to create an object value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + ": 1" + closing)) {
+                    str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
+                } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
+                    // Was about to create an object
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + "\"" + closing)) {
+                    // Was inside an object value string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
+                    // Was inside an object value string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else {
+                    // find last :
+                    auto last_pos = str.find_last_of(':');
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
+                    }
+                    // Cutting back to opening : for object value
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
+                if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
+                    // Was about to create an array value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + "\"" + closing)) {
+                    // Was inside an array value string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
+                    // Was inside an array value string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
+                    // Had just finished a value
+                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
+                } else {
+                    auto last_pos = str.find_last_of("[,");
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
+                    }
+                    // Cutting back to last [ or , for array value
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
+                if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
+                        (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
+                    // Was about to create an object key+value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
+                } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
+                    // Was about to create an object key+value
+                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + "\": 1" + closing)) {
+                    // Was inside an object key string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
+                    // Was inside an object key string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+                } else {
+                    auto last_pos = str.find_last_of(':');
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
+                    }
+                    // fprintf(stderr, "Cutting back to last : for object key+value\n");
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else {
+                throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
+            }
+            // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
+        // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+        // fprintf(stderr, "Closing: TODO\n");
+        return false;
+    }
+    out.json = json::parse(it, end);
+    it = end;
+    return true;
+}
--- a/common/json-partial.h
+++ b/common/json-partial.h
@@ -0,0 +1,37 @@
+#pragma once
+#include <json.hpp>
+
+// Healing marker (empty if the JSON was fully parsed / wasn't healed).
+struct common_healing_marker {
+    // Raw marker.
+    std::string marker;
+
+    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
+    std::string json_dump_marker;
+};
+
+// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
+struct common_json {
+    nlohmann::ordered_json json;
+
+    common_healing_marker healing_marker;
+};
+
+// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
+//
+// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
+// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
+// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
+//
+// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out);
+
+// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out);
--- a/common/minja/chat-template.hpp
+++ b/common/minja/chat-template.hpp
@@ -13,10 +13,12 @@
 #include <chrono>
 #include <cstddef>
 #include <cstdio>
+#include <ctime>
 #include <exception>
 #include <iomanip>
 #include <memory>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <vector>

@@ -393,8 +395,8 @@ class chat_template {

            for (const auto & message_ : adjusted_messages) {
                auto message = message_;
-                if (!message.contains("role") || !message.contains("content")) {
-                    throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
+                if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
+                    throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
                }
                std::string role = message.at("role");

@@ -415,7 +417,6 @@ class chat_template {
                        }
                    }
                    if (polyfill_tool_calls) {
-                        auto content = message.at("content");
                        auto tool_calls = json::array();
                        for (const auto & tool_call : message.at("tool_calls")) {
                            if (tool_call.at("type") != "function") {
@@ -434,8 +435,11 @@ class chat_template {
                        auto obj = json {
                            {"tool_calls", tool_calls},
                        };
-                        if (!content.is_null() && !content.empty()) {
-                            obj["content"] = content;
+                        if (message.contains("content")) {
+                            auto content = message.at("content");
+                            if (!content.is_null() && !content.empty()) {
+                                obj["content"] = content;
+                            }
                        }
                        message["content"] = obj.dump(2);
                        message.erase("tool_calls");
--- a/common/minja/minja.hpp
+++ b/common/minja/minja.hpp
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cctype>
 #include <cstddef>
+#include <cstdint>
 #include <cmath>
 #include <exception>
 #include <functional>
@@ -233,7 +234,7 @@ public:
      }
    } else if (is_object()) {
      if (!index.is_hashable())
-        throw std::runtime_error("Unashable type: " + index.dump());
+        throw std::runtime_error("Unhashable type: " + index.dump());
      auto it = object_->find(index.primitive_);
      if (it == object_->end())
        throw std::runtime_error("Key not found: " + index.dump());
@@ -252,7 +253,7 @@ public:
      auto index = key.get<int>();
      return array_->at(index < 0 ? array_->size() + index : index);
    } else if (object_) {
-      if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+      if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
      auto it = object_->find(key.primitive_);
      if (it == object_->end()) return Value();
      return it->second;
@@ -261,7 +262,7 @@ public:
  }
  void set(const Value& key, const Value& value) {
    if (!object_) throw std::runtime_error("Value is not an object: " + dump());
-    if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+    if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
    (*object_)[key.primitive_] = value;
  }
  Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
@@ -398,7 +399,7 @@ public:
      }
      return false;
    } else if (object_) {
-      if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump());
+      if (!value.is_hashable()) throw std::runtime_error("Unhashable type: " + value.dump());
      return object_->find(value.primitive_) != object_->end();
    } else {
      throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
@@ -416,7 +417,7 @@ public:
    return const_cast<Value*>(this)->at(index);
  }
  Value& at(const Value & index) {
-    if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+    if (!index.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
    if (is_array()) return array_->at(index.get<int>());
    if (is_object()) return object_->at(index.primitive_);
    throw std::runtime_error("Value is not an array or object: " + dump());
@@ -676,8 +677,8 @@ public:
 class VariableExpr : public Expression {
    std::string name;
 public:
-    VariableExpr(const Location & location, const std::string& n)
-      : Expression(location), name(n) {}
+    VariableExpr(const Location & loc, const std::string& n)
+      : Expression(loc), name(n) {}
    std::string get_name() const { return name; }
    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
        if (!context->contains(name)) {
@@ -1200,9 +1201,9 @@ public:

 class SliceExpr : public Expression {
 public:
-    std::shared_ptr<Expression> start, end;
-    SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
-      : Expression(loc), start(std::move(s)), end(std::move(e)) {}
+    std::shared_ptr<Expression> start, end, step;
+    SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e, std::shared_ptr<Expression> && st = nullptr)
+      : Expression(loc), start(std::move(s)), end(std::move(e)), step(std::move(st)) {}
    Value do_evaluate(const std::shared_ptr<Context> &) const override {
        throw std::runtime_error("SliceExpr not implemented");
    }
@@ -1219,18 +1220,35 @@ public:
        if (!index) throw std::runtime_error("SubscriptExpr.index is null");
        auto target_value = base->evaluate(context);
        if (auto slice = dynamic_cast<SliceExpr*>(index.get())) {
-          auto start = slice->start ? slice->start->evaluate(context).get<int64_t>() : 0;
-          auto end = slice->end ? slice->end->evaluate(context).get<int64_t>() : (int64_t) target_value.size();
+          auto len = target_value.size();
+          auto wrap = [len](int64_t i) -> int64_t {
+            if (i < 0) {
+              return i + len;
+            }
+            return i;
+          };
+          int64_t step = slice->step ? slice->step->evaluate(context).get<int64_t>() : 1;
+          if (!step) {
+            throw std::runtime_error("slice step cannot be zero");
+          }
+          int64_t start = slice->start ? wrap(slice->start->evaluate(context).get<int64_t>()) : (step < 0 ? len - 1 : 0);
+          int64_t end = slice->end ? wrap(slice->end->evaluate(context).get<int64_t>()) : (step < 0 ? -1 : len);
          if (target_value.is_string()) {
            std::string s = target_value.get<std::string>();
-            if (start < 0) start = s.size() + start;
-            if (end < 0) end = s.size() + end;
-            return s.substr(start, end - start);
+
+            std::string result;
+            if (start < end && step == 1) {
+              result = s.substr(start, end - start);
+            } else {
+              for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
+                result += s[i];
+              }
+            }
+            return result;
+
          } else if (target_value.is_array()) {
-            if (start < 0) start = target_value.size() + start;
-            if (end < 0) end = target_value.size() + end;
            auto result = Value::array();
-            for (auto i = start; i < end; ++i) {
+            for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
              result.push_back(target_value.at(i));
            }
            return result;
@@ -1305,6 +1323,8 @@ public:
              if (name == "iterable") return l.is_iterable();
              if (name == "sequence") return l.is_array();
              if (name == "defined") return !l.is_null();
+              if (name == "true") return l.to_bool();
+              if (name == "false") return !l.to_bool();
              throw std::runtime_error("Unknown type for 'is' operator: " + name);
            };
            auto value = eval();
@@ -1520,6 +1540,10 @@ public:
            vargs.expectArgs("endswith method", {1, 1}, {0, 0});
            auto suffix = vargs.args[0].get<std::string>();
            return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
+          } else if (method->get_name() == "startswith") {
+            vargs.expectArgs("startswith method", {1, 1}, {0, 0});
+            auto prefix = vargs.args[0].get<std::string>();
+            return prefix.length() <= str.length() && std::equal(prefix.begin(), prefix.end(), str.begin());
          } else if (method->get_name() == "title") {
            vargs.expectArgs("title method", {0, 0}, {0, 0});
            auto res = str;
@@ -2082,28 +2106,37 @@ private:

      while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) {
        if (!consumeToken("[").empty()) {
-            std::shared_ptr<Expression> index;
+          std::shared_ptr<Expression> index;
+          auto slice_loc = get_location();
+          std::shared_ptr<Expression> start, end, step;
+          bool has_first_colon = false, has_second_colon = false;
+
+          if (!peekSymbols({ ":" })) {
+            start = parseExpression();
+          }
+
+          if (!consumeToken(":").empty()) {
+            has_first_colon = true;
+            if (!peekSymbols({ ":", "]" })) {
+              end = parseExpression();
+            }
            if (!consumeToken(":").empty()) {
-              auto slice_end = parseExpression();
-              index = std::make_shared<SliceExpr>(slice_end->location, nullptr, std::move(slice_end));
-            } else {
-              auto slice_start = parseExpression();
-              if (!consumeToken(":").empty()) {
-                consumeSpaces();
-                if (peekSymbols({ "]" })) {
-                  index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), nullptr);
-                } else {
-                  auto slice_end = parseExpression();
-                  index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), std::move(slice_end));
-                }
-              } else {
-                index = std::move(slice_start);
+              has_second_colon = true;
+              if (!peekSymbols({ "]" })) {
+                step = parseExpression();
              }
            }
-            if (!index) throw std::runtime_error("Empty index in subscript");
-            if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
+          }

-            value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
+          if ((has_first_colon || has_second_colon) && (start || end || step)) {
+            index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
+          } else {
+            index = std::move(start);
+          }
+          if (!index) throw std::runtime_error("Empty index in subscript");
+          if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
+
+          value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
        } else if (!consumeToken(".").empty()) {
            auto identifier = parseIdentifier();
            if (!identifier) throw std::runtime_error("Expected identifier in subscript");
--- a/common/regex-partial.cpp
+++ b/common/regex-partial.cpp
@@ -0,0 +1,204 @@
+#include "regex-partial.h"
+#include "common.h"
+#include <functional>
+#include <optional>
+
+common_regex::common_regex(const std::string & pattern) :
+    pattern(pattern),
+    rx(pattern),
+    rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
+
+common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
+    std::smatch match;
+    if (pos > input.size()) {
+        throw std::runtime_error("Position out of bounds");
+    }
+    auto start = input.begin() + pos;
+    auto found = as_match
+        ? std::regex_match(start, input.end(), match, rx)
+        : std::regex_search(start, input.end(), match, rx);
+    if (found) {
+        common_regex_match res;
+        res.type = COMMON_REGEX_MATCH_TYPE_FULL;
+        for (size_t i = 0; i < match.size(); ++i) {
+            auto begin = pos + match.position(i);
+            res.groups.emplace_back(begin, begin + match.length(i));
+        }
+        return res;
+    }
+    std::match_results<std::string::const_reverse_iterator> srmatch;
+    if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
+        auto group = srmatch[1].str();
+        if (group.length() != 0) {
+            auto it = srmatch[1].second.base();
+            // auto position = static_cast<size_t>(std::distance(input.begin(), it));
+            if ((!as_match) || it == input.begin()) {
+                common_regex_match res;
+                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
+                const size_t begin = std::distance(input.begin(), it);
+                const size_t end = input.size();
+                if (begin == std::string::npos || end == std::string::npos || begin > end) {
+                    throw std::runtime_error("Invalid range");
+                }
+                res.groups.push_back({begin, end});
+                return res;
+            }
+        }
+    }
+    return {};
+}
+
+/*
+  Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
+
+  Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
+  to see if a string ends with a partial regex match, but but it's not in std::regex yet.
+  Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.
+
+  - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
+  - /a|b/ -> (a|b).*
+  - /a*?/ -> error, could match ""
+  - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
+  - /.*?ab/ -> ((?:b)?a).* (merge .*)
+  - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
+  - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
+  - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
+  - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
+
+  The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
+  (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
+*/
+std::string regex_to_reversed_partial_regex(const std::string & pattern) {
+    auto it = pattern.begin();
+    const auto end = pattern.end();
+
+    std::function<std::string()> process = [&]() {
+        std::vector<std::vector<std::string>> alternatives(1);
+        std::vector<std::string> * sequence = &alternatives.back();
+
+        while (it != end) {
+            if (*it == '[') {
+                auto start = it;
+                ++it;
+                while (it != end) {
+                    if ((*it == '\\') && (++it != end)) {
+                        ++it;
+                    } else if ((it != end) && (*it == ']')) {
+                        break;
+                    } else {
+                        ++it;
+                    }
+                }
+                if (it == end) {
+                    throw std::runtime_error("Unmatched '[' in pattern");
+                }
+                ++it;
+                sequence->push_back(std::string(start, it));
+            } else if (*it == '*' || *it == '?' || *it == '+') {
+                if (sequence->empty()) {
+                    throw std::runtime_error("Quantifier without preceding element");
+                }
+                sequence->back() += *it;
+                auto is_star = *it == '*';
+                ++it;
+                if (is_star) {
+                    if (*it == '?') {
+                        ++it;
+                    }
+                }
+            } else if (*it == '{') {
+                if (sequence->empty()) {
+                    throw std::runtime_error("Repetition without preceding element");
+                }
+                ++it;
+                auto start = it;
+                while (it != end && *it != '}') {
+                    ++it;
+                }
+                if (it == end) {
+                    throw std::runtime_error("Unmatched '{' in pattern");
+                }
+                auto parts = string_split(std::string(start, it), ",");
+                ++it;
+                if (parts.size() > 2) {
+                    throw std::runtime_error("Invalid repetition range in pattern");
+                }
+
+                auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
+                    if (s.empty()) {
+                        return def;
+                    }
+                    return std::stoi(s);
+                };
+                auto min = parseOptInt(parts[0], 0);
+                auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
+                if (min && max && *max < *min) {
+                    throw std::runtime_error("Invalid repetition range in pattern");
+                }
+                // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
+                auto part = sequence->back();
+                sequence->pop_back();
+                for (int i = 0; i < *min; i++) {
+                    sequence->push_back(part);
+                }
+                if (max) {
+                    for (int i = *min; i < *max; i++) {
+                        sequence->push_back(part + "?");
+                    }
+                } else {
+                    sequence->push_back(part + "*");
+                }
+            } else if (*it == '(') {
+                ++it;
+                if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
+                    it += 2;
+                }
+                auto sub = process();
+                if (*it != ')') {
+                    throw std::runtime_error("Unmatched '(' in pattern");
+                }
+                ++it;
+                auto & part = sequence->emplace_back("(?:");
+                part += sub;
+                part += ")";
+            } else if (*it == ')') {
+                break;
+            } else if (*it == '|') {
+                ++it;
+                alternatives.emplace_back();
+                sequence = &alternatives.back();
+            } else if (*it == '\\' && (++it != end)) {
+                auto str = std::string("\\") + *it;
+                sequence->push_back(str);
+                ++it;
+            } else if (it != end) {
+                sequence->push_back(std::string(1, *it));
+                ++it;
+            }
+        }
+
+        // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
+        // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
+        // We'll do the outermost capturing group and final .* in the enclosing function.
+        std::vector<std::string> res_alts;
+        for (const auto & parts : alternatives) {
+            auto & res = res_alts.emplace_back();
+            for (size_t i = 0; i < parts.size() - 1; i++) {
+                res += "(?:";
+            }
+            for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+                res += *it;
+                if (it != parts.rend() - 1) {
+                    res += ")?";
+                }
+            }
+        }
+        return string_join(res_alts, "|");
+    };
+    auto res = process();
+    if (it != end) {
+        throw std::runtime_error("Unmatched '(' in pattern");
+    }
+
+    return "(" + res + ")[\\s\\S]*";
+}
--- a/common/regex-partial.h
+++ b/common/regex-partial.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <regex>
+#include <string>
+
+enum common_regex_match_type {
+    COMMON_REGEX_MATCH_TYPE_NONE,
+    COMMON_REGEX_MATCH_TYPE_PARTIAL,
+    COMMON_REGEX_MATCH_TYPE_FULL,
+};
+
+struct common_string_range {
+    size_t begin;
+    size_t end;
+    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
+        if (begin > end) {
+            throw std::runtime_error("Invalid range");
+        }
+    }
+    // prevent default ctor
+    common_string_range() = delete;
+    bool empty() const {
+        return begin == end;
+    }
+    bool operator==(const common_string_range & other) const {
+        return begin == other.begin && end == other.end;
+    }
+};
+
+struct common_regex_match {
+    common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
+    std::vector<common_string_range> groups;
+
+    bool operator==(const common_regex_match & other) const {
+        return type == other.type && groups == other.groups;
+    }
+    bool operator!=(const common_regex_match & other) const {
+        return !(*this == other);
+    }
+};
+
+class common_regex {
+    std::string pattern;
+    std::regex rx;
+    std::regex rx_reversed_partial;
+
+  public:
+    explicit common_regex(const std::string & pattern);
+
+    common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
+
+    const std::string & str() const { return pattern; }
+};
+
+// For testing only (pretty print of failures).
+std::string regex_to_reversed_partial_regex(const std::string & pattern);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -161,7 +161,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
    } else {
-        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> trigger_patterns;
        std::vector<std::string> patterns_anywhere;
        std::vector<llama_token> trigger_tokens;
        for (const auto & trigger : params.grammar_triggers) {
@@ -173,10 +173,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
-                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
                {
-                    const auto & pattern = trigger.value;
-                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    patterns_anywhere.push_back(trigger.value);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
+                {
+                    trigger_patterns.push_back(trigger.value);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -190,10 +193,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            }
        }

-        std::vector<std::string> trigger_patterns;
-        if (!patterns_at_start.empty()) {
-            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
-        }
        if (!patterns_anywhere.empty()) {
            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
        }
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum):

 class ModelType(IntEnum):
    TEXT = 1
-    VISION = 2
+    MMPROJ = 2


 AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
@@ -54,7 +54,7 @@ AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
 class ModelBase:
    _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
        ModelType.TEXT: {},
-        ModelType.VISION: {},
+        ModelType.MMPROJ: {},
    }

    dir_model: Path
@@ -88,7 +88,7 @@ class ModelBase:
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
        if type(self) is ModelBase or \
                type(self) is TextModel or \
-                type(self) is VisionModel:
+                type(self) is MmprojModel:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

        self.dir_model = dir_model
@@ -308,6 +308,8 @@ class ModelBase:
                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                            gguf.MODEL_TENSOR.POSNET_NORM1,
                            gguf.MODEL_TENSOR.POSNET_NORM2,
+                            gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
+                            gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
                        )
                    )
                    or not new_name.endswith(".weight")
@@ -437,7 +439,7 @@ class ModelBase:
        assert names

        def func(modelcls: AnyModel) -> AnyModel:
-            model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT
+            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
            for name in names:
                cls._model_classes[model_type][name] = modelcls
            return modelcls
@@ -1113,60 +1115,87 @@ class TextModel(ModelBase):
            self.gguf_writer.add_pooling_type(pooling_type)


-class VisionModel(ModelBase):
-    model_type = ModelType.VISION
-    model_arch = gguf.MODEL_ARCH.CLIP_VISION
+class MmprojModel(ModelBase):
+    model_type = ModelType.MMPROJ
+    model_arch = gguf.MODEL_ARCH.MMPROJ
    preprocessor_config: dict[str, Any]
    global_config: dict[str, Any]

+    has_vision_encoder: bool = True # by default
+    has_audio_encoder: bool = False
+
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

-        if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
-            raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
+        if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
+            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
+
+        if self.has_vision_encoder and self.has_audio_encoder:
+            raise NotImplementedError("both vision + audio not supported yet")

        # get n_embd of the text model
        if "text_config" not in self.hparams:
            self.hparams["text_config"] = {}
+        if "audio_config" not in self.hparams:
+            self.hparams["audio_config"] = {}
        text_config = {**self.hparams, **self.hparams["text_config"]}
        self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
        assert self.n_embd_text > 0, "n_embd not found in hparams"

-        if "vision_config" not in self.hparams:
-            raise ValueError("vision_config not found in hparams")
        # move vision config to the top level, while preserving the original hparams in global_config
        self.global_config = self.hparams
-        self.hparams = self.hparams["vision_config"]
+
+        if "vision_config" in self.hparams:
+            self.hparams = self.hparams["vision_config"]
+        elif "audio_config" in self.hparams:
+            self.hparams = self.hparams["audio_config"]
+        else:
+            raise ValueError("vision_config / audio_config not found in hparams")

        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
-        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

        # load preprocessor config
        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
            self.preprocessor_config = json.load(f)

    def set_type(self):
-        self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
+        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)

    def set_gguf_parameters(self):
        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
-        self.gguf_writer.add_vision_has_vision_encoder(True)

-        # vision config
-        self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
-        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
-        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
-        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_vision_block_count(self.block_count)
-        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+        if self.has_vision_encoder:
+            self.gguf_writer.add_clip_has_vision_encoder(True)
+            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)

-        # preprocessor config
-        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
-        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+            # vision config
+            self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
+            self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_block_count(self.block_count)
+            self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+
+            # preprocessor config
+            self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+
+        elif self.has_audio_encoder:
+            self.gguf_writer.add_clip_has_audio_encoder(True)
+            self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
+
+            # audio config
+            self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
+            self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
+            self.gguf_writer.add_audio_block_count(self.block_count)
+            self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
+
+        else:
+            raise ValueError("MmprojModel must have either vision or audio encoder")

    def write_vocab(self):
-        raise ValueError("VisionModel does not support vocab writing")
+        raise ValueError("MmprojModel does not support vocab writing")


@ModelBase.register("GPTNeoXForCausalLM")
@@ -1950,7 +1979,7 @@ class LlamaModel(TextModel):
    "LlavaForConditionalGeneration", # pixtral
    "Mistral3ForConditionalGeneration", # mistral small 3.1
 )
-class LlavaVisionModel(VisionModel):
+class LlavaVisionModel(MmprojModel):
    img_break_tok_id = -1

    def __init__(self, *args, **kwargs):
@@ -1976,7 +2005,7 @@ class LlavaVisionModel(VisionModel):
        super().set_gguf_parameters()
        hparams = self.hparams
        if hparams["model_type"] == "pixtral":
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])

            # hidden_act
@@ -2015,7 +2044,7 @@ class LlavaVisionModel(VisionModel):


@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
-class SmolVLMModel(VisionModel):
+class SmolVLMModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.hparams["model_type"] == "smolvlm_vision":
@@ -2027,7 +2056,7 @@ class SmolVLMModel(VisionModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
        self.gguf_writer.add_vision_use_gelu(True)
@@ -2069,6 +2098,9 @@ class Llama4Model(LlamaModel):
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
        # split the gate_up into gate and up
        if "gate_up_proj" in name:
            name_up = name.replace("gate_up_proj", "up_proj.weight")
@@ -2089,6 +2121,26 @@ class Llama4Model(LlamaModel):
        return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Llama4ForConditionalGeneration")
+class Llama4VisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
+        assert self.hparams["hidden_act"] == "gelu"
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if "multi_modal_projector" in name or "vision_model" in name:
+            # process vision tensors
+            if "positional_embedding_vlm" in name and ".weight" not in name:
+                name += ".weight"
+            return [(self.map_tensor_name(name), data_torch)]
+        return []
+
+
@ModelBase.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA
@@ -2621,7 +2673,7 @@ class Qwen2Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
 class Qwen2VLModel(TextModel):
    model_arch = gguf.MODEL_ARCH.QWEN2VL

@@ -2645,8 +2697,8 @@ class Qwen2VLModel(TextModel):
        return [(self.map_tensor_name(name), data_torch)]


-@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
-class Qwen2VLVisionModel(VisionModel):
+@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+class Qwen2VLVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hparams["image_size"] = self.hparams.get("image_size", 560)
@@ -2661,9 +2713,9 @@ class Qwen2VLVisionModel(VisionModel):
        super().set_gguf_parameters()
        hparams = self.hparams
        if self.global_config['model_type'] == 'qwen2_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
        elif self.global_config['model_type'] == 'qwen2_5_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
            self.gguf_writer.add_vision_use_silu(True)
            # find n_wa_pattern (window attention pattern)
            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2722,11 +2774,11 @@ class Qwen2VLVisionModel(VisionModel):


@ModelBase.register("InternVisionModel")
-class InternVisionModel(VisionModel):
+class InternVisionModel(MmprojModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
        # hidden_act
        if hparams["hidden_act"] == "silu":
@@ -3984,11 +4036,11 @@ class Gemma3Model(TextModel):


@ModelBase.register("Gemma3ForConditionalGeneration")
-class Gemma3VisionModel(VisionModel):
+class Gemma3VisionModel(MmprojModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
        # default values below are taken from HF tranformers code
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)
@@ -5746,11 +5798,20 @@ class GraniteModel(LlamaModel):
            logger.info("gguf: (granite) logits_scale = %s", logits_scale)


-@ModelBase.register("GraniteMoeForCausalLM")
+@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
    """Conversion for IBM's GraniteMoeForCausalLM"""
    model_arch = gguf.MODEL_ARCH.GRANITE_MOE

+    def set_gguf_parameters(self):
+        """GraniteMoeShared uses GraniteMoe parameters plus the following:
+        - shared_intermediate_size
+        """
+        super().set_gguf_parameters()
+        if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
+            logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """In modeling_granitemoe, the JetMoe implementation of parallel experts
        is used. This essentially merges w1 and w3 into a single tensor with 2x
@@ -5761,12 +5822,21 @@ class GraniteMoeModel(GraniteModel):
        if name.endswith("block_sparse_moe.input_linear.weight"):
            ffn_dim = self.hparams["intermediate_size"]
            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
-            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            gate, up = data_torch.split(ffn_dim, dim=-2)
            return [
                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
            ]

+        if name.endswith("shared_mlp.input_linear.weight"):
+            ffn_dim = self.hparams["shared_intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
+            gate, up = data_torch.split(ffn_dim, dim=-2)
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+            ]
+
        return super().modify_tensors(data_torch, name, bid)


@@ -5917,6 +5987,52 @@ class ChameleonModel(TextModel):
        return data_torch


+@ModelBase.register("UltravoxModel")
+class UltravoxModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA # dummy
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument")
+
+
+@ModelBase.register("UltravoxModel")
+class UltravoxAudioModel(MmprojModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["hidden_size"] = self.hparams["d_model"]
+        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims  # unused
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # prevent clash naming with vision tensors
+        if name.startswith("multi_modal_projector"):
+            name = "audio." + name
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # transpose conv1 and conv2 bias
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######


@@ -6092,13 +6208,15 @@ def split_str_to_n_bytes(split_str: str) -> int:


 def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
+    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
+    # maybe we should fallback to text model's arch in that case, since not many models have both
    text_config = hparams.get("text_config", {})
    vision_config = hparams.get("vision_config", {})
    arch = hparams["architectures"][0]
    # if "architectures" is found in the sub-config, use that instead
    if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
        arch = text_config["architectures"][0]
-    elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
+    elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
        arch = vision_config["architectures"][0]
    return arch

@@ -6161,7 +6279,7 @@ def main() -> None:

    with torch.inference_mode():
        output_type = ftype_map[args.outtype]
-        model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
+        model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
        hparams = ModelBase.load_hparams(dir_model)
        model_architecture = get_model_architecture(hparams, model_type)
        logger.info(f"Model architecture: {model_architecture}")
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -56,60 +56,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

 ## Model Supports

-| Model Name                  | FP16  | Q8_0 | Q4_0 |
+| Model Name                  | FP16  | Q4_0 | Q8_0 |
 |:----------------------------|:-----:|:----:|:----:|
-| AquilaChat2-7B              |   √   |   √  |   √  |
-| Baichuan-7b                 |   √   |   √  |   √  |
-| Baichuan2-7B-Chat           |   √   |   √  |   √  |
-| bitnet_b1_58-large          |   √   |   √  |   √  |
-| bloom-560m                  |   √   |   x  |   √  |
-| bloomz-alpaca-560m          |   √   |   x  |   √  |
-| c4ai-command-r-35B-v01      |   x   |   x  |   x  |
-| chatglm3-6B                 |   x   |   x  |   x  |
-| chinese-alpaca-2-1.3b       |   √   |   √  |   √  |
-| CodeShell-7B                |   √   |   √  |   √  |
-| deepseek-ai_deepseek-coder-1.3B-base | x |   x  |   x  |
-| deepseek-ai_DeepSeek-V2-Lite | x   |   x  |   x   |
-| deepseek-coder-6.7B-instruct | x   |   x  |   x   |
-| DeepSeek-V2-Lite-64x1.5B    |   x   |   x  |   x  |
-| falcon-7b-instruct          |   √   |   √  |   √  |
-| flan-t5-large               |   √   |   √  |   √  |
-| gemma-2-9b-it               |   √   |   √  |   √  |
-| glm-4-9B                    |   x   |   x  |   x  |
-| gpt2                        |   √   |   √  |   √  |
-| Gpt2-163M                   |   √   |   √  |   √  |
-| granite-3B-code-instruct    |   √   |   √  |   √  |
+| Llama-2                     |   √   |   √  |   √  |
+| Llama-3                     |   √   |   √  |   √  |
+| Mistral-7B                  |   √   |   √  |   √  |
+| Mistral MOE                 |   √   |   √  |   √  |
+| DBRX                        |   -   |   -  |   -  |
+| Falcon                      |   √   |   √  |   √  |
+| Chinese LLaMA/Alpaca        |   √   |   √  |   √  |
+| Vigogne(French)             |   √   |   √  |   √  |
+| BERT                        |   x   |   x  |   x  |
+| Koala                       |   √   |   √  |   √  |
+| Baichuan                    |   √   |   √  |   √  |
+| Aquila 1 & 2                |   √   |   √  |   √  |
+| Starcoder models            |   √   |   √  |   √  |
+| Refact                      |   √   |   √  |   √  |
+| MPT                         |   √   |   √  |   √  |
+| Bloom                       |   √   |   √  |   √  |
+| Yi models                   |   √   |   √  |   √  |
+| stablelm models             |   √   |   √  |   √  |
+| DeepSeek models             |   x   |   x  |   x  |
+| Qwen models                 |   √   |   √  |   √  |
+| PLaMo-13B                   |   √   |   √  |   √  |
+| Phi models                  |   √   |   √  |   √  |
+| PhiMoE                      |   √   |   √  |   √  |
+| GPT-2                       |   √   |   √  |   √  |
+| Orion                       |   √   |   √  |   √  |
+| InternlLM2                  |   √   |   √  |   √  |
+| CodeShell                   |   √   |   √  |   √  |
+| Gemma                       |   √   |   √  |   √  |
+| Mamba                       |   √   |   √  |   √  |
+| Xverse                      |   √   |   √  |   √  |
+| command-r models            |   √   |   √  |   √  |
+| Grok-1                      |   -   |   -  |   -  |
+| SEA-LION                    |   √   |   √  |   √  |
 | GritLM-7B                   |   √   |   √  |   √  |
-| internlm2_5-7b-chat         |   √   |   √  |   √  |
-| koala-7B-HF                 |   √   |   √  |   √  |
-| Llama-2-7b-chat-hf          |   √   |   √  |   √  |
-| Llama-3-Smaug-8B            |   √   |   √  |   √  |
-| Llama2-Chinese-7b-Chat      |   √   |   √  |   √  |
-| Llama3-8B                   |   √   |   √  |   √  |
-| Llama3-8b-chinese           |   √   |   √  |   √  |
-| mamba-130m-hf               |   √   |   √  |   √  |
-| Mistral-7B-Instruct-v0.2    |   √   |   √  |   √  |
-| Mixtral-8x7B-Instruct-v0.1  |   x   |   √  |   √  |
-| mpt-7B                      |   √   |   √  |   √  |
-| OLMo-1B-hf                  |   √   |   √  |   √  |
-| OpenELM-3B-Instruct         |   √   |   √  |   √  |
-| Orion-14b-base              |   √   |   √  |   √  |
-| phi1                        |   x   |   x  |   x  |
-| phi2                        |   x   |   x  |   x  |
-| Phi-3-mini-4k-instruct      |   √   |   √  |   √  |
-| plamo-13b                   |   √   |   √  |   √  |
-| pythia-70M                  |   x   |   x  |   x  |
-| Qwen-7B                     |   √   |   √  |   √  |
-| Qwen2-1.5B-Instruct         |   √   |   x  |   √  |
-| Refact-1_6B-fim             |   √   |   √  |   √  |
-| SmolLM-135M                 |   √   |   √  |   √  |
-| stablelm-zephyr             |   x   |   x  |   x  |
-| stablelm-2-zephyr-1_6b      |   x   |   x  |   x  |
-| starcoderbase-1b            |   √   |   √  |   √  |
-| starcoder2-3b               |   √   |   √  |   √  |
-| vigogne-7b-chat             |   √   |   √  |   √  |
-| xverse-7b-chat              |   √   |   √  |   √  |
-| Yi-6b-Chat                  |   √   |   √  |   √  |
+| OLMo                        |   √   |   √  |   √  |
+| OLMo 2                      |   √   |   √  |   √  |
+| OLMoE                       |   √   |   √  |   √  |
+| Granite models              |   √   |   √  |   √  |
+| GPT-NeoX                    |   √   |   √  |   √  |
+| Pythia                      |   √   |   √  |   √  |
+| Snowflake-Arctic MoE        |   -   |   -  |   -  |
+| Smaug                       |   √   |   √  |   √  |
+| Poro 34B                    |   √   |   √  |   √  |
+| Bitnet b1.58 models         |   √   |   x  |   x  |
+| Flan-T5                     |   √   |   √  |   √  |
+| Open Elm models             |   x   |   √  |   √  |
+| chatGLM3-6B + ChatGLM4-9b +  GLMEdge-1.5b + GLMEdge-4b    |   √   |   √  |   √  |
+| GLM-4-0414                  |   √   |   √  |   √  |
+| SmolLM                      |   √   |   √  |   √  |
+| EXAONE-3.0-7.8B-Instruct    |   √   |   √  |   √  |
+| FalconMamba Models          |   √   |   √  |   √  |
+| Jais Models                 |   -   |   x  |   x  |
+| Bielik-11B-v2.3             |   √   |   √  |   √  |
+| RWKV-6                      |   -   |   √  |   √  |
+| QRWKV-6                     |   √   |   √  |   √  |
+| GigaChat-20B-A3B            |   x   |   x  |   x  |
+| Trillion-7B-preview         |   √   |   √  |   √  |
+| Ling models                 |   √   |   √  |   √  |
+
+
+**Multimodal**
+| Model Name                  | FP16  | Q4_0 | Q8_0 |
+|:----------------------------|:-----:|:----:|:----:|
+| LLaVA 1.5 models, LLaVA 1.6 models      |   x   |   x  |   x  |
+|  BakLLaVA                   |   √   |   √  |   √  |
+|  Obsidian                   |   √   |   -  |   -  |
+|  ShareGPT4V                 |   x   |   -  |   -  |
+|  MobileVLM 1.7B/3B models   |   -   |   -  |   -  |
+|  Yi-VL                      |   -   |   -  |   -  |
+|  Mini CPM                   |   √   |   √  |   √  |
+|  Moondream                  |   √   |   √  |   √  |
+|  Bunny                      |   √   |   -  |   -  |
+|  GLM-EDGE                   |   √   |   √  |   √  |
+|  Qwen2-VL                   |   √   |   √  |   √  |



--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -17,25 +17,25 @@

 **SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.

-**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
+**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to Intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
 - **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
+- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

 ### Llama.cpp + SYCL

-The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
+The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
+SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.

 ## Recommended Release

-The SYCL backend would be broken by some PRs due to no online CI.
-
-The following release is verified with good quality:
+The following releases are verified and recommended:

 |Commit ID|Tag|Release|Verified  Platform| Update date|
 |-|-|-|-|-|
+|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
 |3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
 |fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||

@@ -106,15 +106,14 @@ SYCL backend supports Intel GPU Family:
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
-| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake    |
-| Intel iGPU                    | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
+| Intel Arc Series              | Support | Arc 770, 730M, Arc A750, B580         |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
+| Intel iGPU                    | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7  |

 *Notes:*

 - **Memory**
  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
-
  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.

 - **Execution Unit (EU)**
@@ -138,9 +137,11 @@ Note: AMD GPU support is highly experimental and is incompatible with F16.
 Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.

 ## Docker
-The docker build option is currently limited to *intel GPU* targets.
+
+The docker build option is currently limited to *Intel GPU* targets.

 ### Build image
+
 ```sh
 # Using FP16
 docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
@@ -148,9 +149,10 @@ docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f

 *Notes*:

-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
+To build in default FP32 *(Slower than FP16 alternative)*, set `--build-arg="GGML_SYCL_F16=OFF"` in the previous command.

 You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
+Check the [documentation for Docker](../docker.md) to see the available images.

 ### Run container

@@ -250,7 +252,7 @@ sycl-ls

 - **Intel GPU**

-When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
+When targeting an intel GPU, the user should expect one or more devices among the available SYCL devices. Please make sure that at least one GPU is present via `sycl-ls`, for instance `[level_zero:gpu]` in the sample output below:

 ```
 [opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
@@ -282,7 +284,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:

 #### Intel GPU

-```
+```sh
 ./examples/sycl/build.sh
 ```

@@ -351,7 +353,7 @@ cmake --build build --config Release -j -v

 #### Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).

 ##### Check device

@@ -398,11 +400,15 @@ Choose one of following methods to run.

 ```sh
 ./examples/sycl/run-llama2.sh 0
+# OR
+./examples/sycl/run-llama3.sh 0
 ```
 - Use multiple devices:

 ```sh
 ./examples/sycl/run-llama2.sh
+# OR
+./examples/sycl/run-llama3.sh
 ```

 2. Command line
@@ -425,13 +431,13 @@ Examples:
 - Use device 0:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
 ```

 - Use multiple devices:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
 ```

 *Notes:*
@@ -452,7 +458,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 1. Install GPU driver

-Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
+Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

 2. Install Visual Studio

@@ -629,7 +635,7 @@ Once it is completed, final results will be in **build/Release/bin**

 #### Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).

 ##### Check device

@@ -648,7 +654,7 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 build\bin\llama-ls-sycl-device.exe
 ```

-This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPU* it would look like the following:
 ```
 found 2 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@@ -658,13 +664,14 @@ found 2 SYCL devices:
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|

 ```
+
 #### Choose level-zero devices

 |Chosen Device ID|Setting|
 |-|-|
-|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
+|0|Default option. You may also want to `set ONEAPI_DEVICE_SELECTOR="level_zero:0"`|
 |1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
-|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
+|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|

 #### Execute

@@ -673,7 +680,13 @@ Choose one of following methods to run.
 1. Script

 ```
-examples\sycl\win-run-llama2.bat
+examples\sycl\win-run-llama-2.bat
+```
+
+or
+
+```
+examples\sycl\win-run-llama-3.bat
 ```

 2. Command line
@@ -697,13 +710,13 @@ Examples:
 - Use device 0:

 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
 ```

 - Use multiple devices:

 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
 ```


@@ -714,7 +727,9 @@ Note:
 ```sh
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
+
 Or
+
 ```sh
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```
@@ -726,14 +741,17 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 | Name               | Value                                 | Function                                    |
 |--------------------|---------------------------------------|---------------------------------------------|
-| GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
+| GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.           |
 | GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)             | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
-| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path. (1.) |
 | GGML_SYCL_GRAPH    | ON *(default)* \|OFF *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
+| GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

+1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
+
 #### Runtime

 | Name              | Value            | Function                                                                                                                  |
@@ -741,6 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
+| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |


@@ -750,7 +769,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 ## Q&A

- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
+- Error:  `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.

  - Potential cause: Unavailable oneAPI installation or not set ENV variables.
  - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
@@ -779,18 +798,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512

  It's same for other projects including llama.cpp SYCL backend.

- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
+- `Native API failed. Native API returns: 39 (UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)`, `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 3503030272 Bytes of memory on device`, or `failed to allocate SYCL0 buffer`

-  Device Memory is not enough.
+  You are running out of Device Memory.

  |Reason|Solution|
  |-|-|
-  |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
-  |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
+  | The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
+  | The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|

 ### **GitHub contribution**:
-Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
+Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.

 ## TODO

- NA
+- Review ZES_ENABLE_SYSMAN: https://github.com/intel/compute-runtime/blob/master/programmers-guide/SYSMAN.md#support-and-limitations
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -22,6 +22,9 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)

 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).

@@ -104,7 +107,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

 The defaults are:

- `MUSA_VERSION` set to `rc3.1.1`
+- `MUSA_VERSION` set to `rc4.0.1`

 The resulting images, are essentially the same as the non-MUSA images:

--- a/docs/function-calling.md
+++ b/docs/function-calling.md
@@ -325,36 +325,65 @@ To get the official template from original HuggingFace repos, you can use [scrip
 > [!TIP]
 > If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)

+> [!CAUTION]
+> Beware of extreme KV quantizations (e.g. `-ctk q4_0`), they can substantially degrade the model's tool calling performance.
+
 Test in CLI (or with any library / software that can use OpenAI-compatible API backends):

 ```bash
 curl http://localhost:8080/v1/chat/completions -d '{
-"model": "gpt-3.5-turbo",
-"tools": [
-    {
-    "type":"function",
-    "function":{
-        "name":"python",
-        "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
-        "parameters":{
-        "type":"object",
-        "properties":{
-            "code":{
-            "type":"string",
-            "description":"The code to run in the ipython interpreter."
+    "model": "gpt-3.5-turbo",
+    "tools": [
+        {
+        "type":"function",
+        "function":{
+            "name":"python",
+            "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+            "parameters":{
+            "type":"object",
+            "properties":{
+                "code":{
+                "type":"string",
+                "description":"The code to run in the ipython interpreter."
+                }
+            },
+            "required":["code"]
            }
-        },
-        "required":["code"]
        }
-    }
-    }
-],
-"messages": [
-    {
-    "role": "user",
-    "content": "Print a hello world message with python."
-    }
-]
+        }
+    ],
+    "messages": [
+        {
+        "role": "user",
+        "content": "Print a hello world message with python."
+        }
+    ]
+}'
+
+
+curl http://localhost:8080/v1/chat/completions -d '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
+        {"role": "user", "content": "What is the weather in Istanbul?"}
+    ],
+    "tools": [{
+        "type":"function",
+        "function":{
+            "name":"get_current_weather",
+            "description":"Get the current weather in a given location",
+            "parameters":{
+                "type":"object",
+                "properties":{
+                    "location":{
+                        "type":"string",
+                        "description":"The city and country/state, e.g. `San Francisco, CA`, or `Paris, France`"
+                    }
+                },
+                "required":["location"]
+            }
+        }
+    }]
 }'
 ```

--- a/docs/multimodal.md
+++ b/docs/multimodal.md
@@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
 - [llama-mtmd-cli](../tools/mtmd/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API

-To enable it, can use use one of the 2 methods below:
+Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+
+To enable it, you can use one of the 2 methods below:

 - Use `-hf` option with a supported model (see a list of pre-quantized model below)
    - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@@ -31,12 +33,14 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload

 ## Pre-quantized models

-These are ready-to-use models, most of them come with `Q4_K_M` quantization by default.
+These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org

 Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server`

 NOTE: some models may require large context window, for example: `-c 8192`

+**Vision models**:
+
 ```sh
 # Gemma 3
 (tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -69,9 +73,20 @@ NOTE: some models may require large context window, for example: `-c 8192`

 # InternVL 2.5 and 3
 (tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
-(tool_name) -hf ggml-org/InternVL2_5-2B-GGUF
+(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
 (tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
 (tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
-(tool_name) -hf ggml-org/InternVL3-4B-Instruct-GGUF
+(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
 (tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
+
+# Llama 4 Scout
+(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
+```
+
+**Audio models**:
+
+```sh
+# Ultravox 0.5
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
 ```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -32,6 +32,7 @@ else()
    add_subdirectory(speculative)
    add_subdirectory(speculative-simple)
    add_subdirectory(gen-docs)
+    add_subdirectory(training)
    if (NOT GGML_BACKEND_DL)
        add_subdirectory(convert-llama2c-to-ggml)
        # these examples use the backends directly and cannot be built with dynamic loading
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
    const int N = 5;  // n-gram size
    const int G = 15; // max verification n-grams

-    const bool dump_kv_cache = params.dump_kv_cache;
-
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
    // here we keep adding new n-grams as we go
    ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);

-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
    const auto t_dec_start = ggml_time_us();

    // sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
    }

    while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
        // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
        //
        // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {

    common_sampler_free(smpl);

-    llama_kv_cache_view_free(&kvc_view);
-
    llama_batch_free(batch);

    llama_backend_free();
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
    // max. number of additional tokens to draft if match is found
    const int n_draft = params.speculative.n_max;

-    const bool dump_kv_cache = params.dump_kv_cache;
-
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){

    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);

-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
    const auto t_dec_start = ggml_time_us();

    while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
        // print current draft sequence
        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

--- a/examples/parallel/README.md
+++ b/examples/parallel/README.md
@@ -1,3 +1,14 @@
 # llama.cpp/example/parallel

 Simplified simulation of serving incoming requests in parallel
+
+## Example
+
+Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
+
+```bash
+llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
+```
+
+> [!NOTE]
+> It's recommended to use base models with this example. Instruction tuned models might not be able to properly follow the custom chat template specified here, so the results might not be as expected.
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -34,11 +34,61 @@ static std::string k_system =
 R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
 The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

-User: Recommend a nice restaurant in the area.
-Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
-User: Who is Richard Feynman?
-Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
-User:)";
+User:
+Recommend a nice restaurant in the area.
+Assistant:
+I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
+User:
+Who is Richard Feynman?
+Assistant:
+Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
+)";
+
+static std::vector<std::string> k_questions = {
+    "What is the tallest mountain in the world?",
+    "Who was the first person to win two Nobel Prizes?",
+    "Which country invented paper?",
+    "What organ is primarily responsible for pumping blood throughout the body?",
+    "Which planet is known for its prominent ring system?",
+    "Who directed the movie 'Inception'?",
+    "What is the freezing point of water in Fahrenheit?",
+    "Which animal is known to have the longest lifespan?",
+    "What language has the most native speakers worldwide?",
+    "What is the capital city of Canada?",
+    "Who is credited with inventing the World Wide Web?",
+    "Which metal is liquid at room temperature?",
+    "What is the term for an animal that eats both plants and meat?",
+    "Who painted 'The Starry Night'?",
+    "What gas do humans exhale that plants use for photosynthesis?",
+    "What year did World War II end?",
+    "Which continent has the most countries?",
+    "Who wrote the novel 'Frankenstein'?",
+    "What does DNA stand for?",
+    "What is the main ingredient in traditional Japanese miso soup?"
+};
+
+static std::vector<std::string> k_answers = {
+    "The tallest mountain in the world is Mount Everest.",
+    "Marie Curie was the first person to win two Nobel Prizes.",
+    "Paper was invented in China.",
+    "The heart is the organ responsible for pumping blood.",
+    "Saturn is known for its prominent ring system.",
+    "Christopher Nolan directed the movie 'Inception'.",
+    "The freezing point of water in Fahrenheit is 32°F.",
+    "The bowhead whale is known to have the longest lifespan among mammals.",
+    "Mandarin Chinese has the most native speakers in the world.",
+    "The capital city of Canada is Ottawa.",
+    "Tim Berners-Lee is credited with inventing the World Wide Web.",
+    "Mercury is the metal that is liquid at room temperature.",
+    "An animal that eats both plants and meat is called an omnivore.",
+    "'The Starry Night' was painted by Vincent van Gogh.",
+    "Humans exhale carbon dioxide, which plants use in photosynthesis.",
+    "World War II ended in 1945.",
+    "Africa is the continent with the most countries.",
+    "The novel 'Frankenstein' was written by Mary Shelley.",
+    "DNA stands for Deoxyribonucleic Acid.",
+    "The main ingredient in traditional Japanese miso soup is fermented soybean paste."
+};

 static std::vector<std::string> k_prompts = {
    "What is the meaning of life?",
@@ -49,7 +99,7 @@ static std::vector<std::string> k_prompts = {
    "What is the best way to learn a new language?",
    "How to get a job at Google?",
    "If you could have any superpower, what would it be?",
-    "I want to learn how to play the piano.",
+    "I want to learn how to play the piano. What would be the best way to do it?",
 };

 struct client {
@@ -68,6 +118,7 @@ struct client {
    int64_t t_start_prompt;
    int64_t t_start_gen;

+    int32_t n_past    = 0;
    int32_t n_prompt  = 0;
    int32_t n_decoded = 0;
    int32_t i_batch   = -1;
@@ -107,6 +158,7 @@ int main(int argc, char ** argv) {
    common_params params;

    params.n_predict = 128;
+    params.n_junk = 0;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
@@ -126,7 +178,11 @@ int main(int argc, char ** argv) {
    // insert new requests as soon as the previous one is done
    const bool cont_batching = params.cont_batching;

-    const bool dump_kv_cache = params.dump_kv_cache;
+    // is the system prompt shared in the cache
+    const bool is_sp_shared = params.is_pp_shared;
+
+    // extra text to insert in each client's prompt in order to make it larger
+    const int32_t n_junk = params.n_junk;

    // init llama.cpp
    llama_backend_init();
@@ -169,6 +225,7 @@ int main(int argc, char ** argv) {
    }

    std::vector<llama_token> tokens_system;
+
    tokens_system = common_tokenize(ctx, k_system, true);
    const int32_t n_tokens_system = tokens_system.size();

@@ -182,15 +239,13 @@ int main(int argc, char ** argv) {
    int32_t n_total_gen    = 0;
    int32_t n_cache_miss   = 0;

-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
    const auto t_main_start = ggml_time_us();

    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
    LOG_INF("\n");

-    {
+    if (is_sp_shared) {
        LOG_INF("%s: Evaluating the system prompt ...\n", __func__);

        for (int32_t i = 0; i < n_tokens_system; ++i) {
@@ -213,11 +268,6 @@ int main(int argc, char ** argv) {
    LOG_INF("Processing requests ...\n\n");

    while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
        common_batch_clear(batch);

        // decode any currently ongoing sequences
@@ -228,7 +278,7 @@ int main(int argc, char ** argv) {

            client.i_batch = batch.n_tokens;

-            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, client.n_past++, { client.id + 1 }, true);

            client.n_decoded += 1;
        }
@@ -254,9 +304,23 @@ int main(int argc, char ** argv) {
                    client.t_start_gen    = 0;

                    client.input    = k_prompts[rand() % k_prompts.size()];
-                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

+                    // construct the prompt:
+                    // [system prompt] + [junk] + [user prompt]
+                    client.n_past = 0;
+                    client.prompt = "";
+                    if (is_sp_shared) {
+                        client.n_past = n_tokens_system;
+                    } else {
+                        client.prompt += k_system;
+                    }
+                    for (int i = 0; i < n_junk; ++i) {
+                        const int r = rand() % k_questions.size();
+                        client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
+                    }
+                    client.prompt += "User:\n" + client.input + "\nAssistant:\n";
+
                    common_sampler_reset(client.smpl);

                    // do not prepend BOS because we have a system prompt!
@@ -264,7 +328,7 @@ int main(int argc, char ** argv) {
                    tokens_prompt = common_tokenize(ctx, client.prompt, false);

                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                        common_batch_add(batch, tokens_prompt[i], client.n_past++, { client.id + 1 }, false);
                    }

                    // extract the logits only for the last token
@@ -363,10 +427,9 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
-                        (llama_vocab_is_eog(vocab, id) ||
-                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
-                         client.response.find("User:") != std::string::npos ||
-                         client.response.find('\n') != std::string::npos)) {
+                    (llama_vocab_is_eog(vocab, id) ||
+                     (params.n_predict > 0 && client.n_decoded >= params.n_predict) ||
+                     client.response.find("User:") != std::string::npos)) {
                    // basic reverse prompt
                    const size_t pos = client.response.find("User:");
                    if (pos != std::string::npos) {
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
    }
 }

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
    // clear previous kv_cache values (irrelevant for embeddings)
    llama_kv_self_clear(ctx);

    // run model
    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        LOG_ERR("%s : failed to decode\n", __func__);
+    if (llama_encode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to encode\n", __func__);
    }

    for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_encode(ctx, batch, out, s, n_embd);
            common_batch_clear(batch);
            p += s;
            s = 0;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {

    // final batch
    float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_encode(ctx, batch, out, s, n_embd);

    // save embeddings to chunks
    for (int i = 0; i < n_chunks; i++) {
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
        batch_add_seq(query_batch, query_tokens, 0);

        std::vector<float> query_emb(n_embd, 0);
-        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
+        batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);

        common_batch_clear(query_batch);

--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
    auto generate = [&](const std::string & prompt) {
        std::string response;

-        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
+        const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;

        // tokenize the prompt
        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
        while (true) {
            // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_kv_self_used_cells(ctx);
+            int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
            if (n_ctx_used + batch.n_tokens > n_ctx) {
                printf("\033[0m\n");
                fprintf(stderr, "context size exceeded\n");
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -84,13 +84,13 @@ int main(int argc, char ** argv) {
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
-    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

+    const llama_vocab * vocab = llama_model_get_vocab(model);
    // tokenize the prompt

    // find the number of tokens in the prompt
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -12,16 +12,16 @@ source /opt/intel/oneapi/setvars.sh

 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
-NGL=33
-CONEXT=4096
+NGL=99
+CONTEXT=4096

 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
    echo "use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none

 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
 fi
--- a/examples/sycl/run-llama3.sh
+++ b/examples/sycl/run-llama3.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+#  MIT license
+#  Copyright (C) 2025 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+# If you want more control, DPC++ Allows selecting a specific device through the
+# following environment variable
+#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
+NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
+CONTEXT=4096
+
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
+    echo "Using $GGML_SYCL_DEVICE as the main GPU"
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
+fi
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


-.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
--- a/examples/sycl/win-run-llama3.bat
+++ b/examples/sycl/win-run-llama3.bat
@@ -0,0 +1,9 @@
+::  MIT license
+::  Copyright (C) 2024 Intel Corporation
+::  SPDX-License-Identifier: MIT
+
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
--- a/examples/training/CMakeLists.txt
+++ b/examples/training/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-finetune)
+add_executable(${TARGET} finetune.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/training/README.md
+++ b/examples/training/README.md
@@ -0,0 +1,17 @@
+# llama.cpp/examples/training
+
+This directory contains examples related to language model training using llama.cpp/GGML.
+So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
+Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
+**For CPU training, compile llama.cpp without any additional backends such as CUDA.**
+**For CUDA training, use the maximum number of GPU layers.**
+
+Proof of concept:
+
+``` sh
+export model_name=llama_3.2-1b && export quantization=f32
+./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
+./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
+```
+
+The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
--- a/examples/training/finetune.cpp
+++ b/examples/training/finetune.cpp
@@ -0,0 +1,96 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.escape = false;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+        return 1;
+    }
+
+    if (params.use_mmap) {
+        LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
+        params.use_mmap = false;
+    }
+    if (params.cache_type_k != GGML_TYPE_F32) {
+        LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+        params.cache_type_k = GGML_TYPE_F32;
+    }
+    if (params.cache_type_v != GGML_TYPE_F32) {
+        LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+        params.cache_type_v = GGML_TYPE_F32;
+    }
+
+    common_init();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model and apply lora adapter, if any
+    common_init_result llama_init = common_init_from_params(params);
+    llama_model_ptr   & model = llama_init.model;
+    llama_context_ptr & ctx   = llama_init.context;
+
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    }
+
+    constexpr float val_split = 0.05f;
+
+    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
+    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
+
+    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
+    optimizer_params.adamw.alpha = 1e-7f; // learning rate
+
+    struct llama_opt_params lopt_params {
+        /*n_ctx_train     =*/ 0,
+        /*param_filter    =*/ llama_opt_param_filter_all,
+        /*param_filter_ud =*/ nullptr,
+        /*get_opt_pars    =*/ ggml_opt_get_constant_optimizer_params,
+        /*get_opt_pars_ud =*/ &optimizer_params,
+    };
+    llama_opt_init(ctx.get(), model.get(), lopt_params);
+
+    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
+
+    ggml_opt_result_t result_train = ggml_opt_result_init();
+    ggml_opt_result_t result_eval  = ggml_opt_result_init();
+
+    for (int epoch = 0; epoch < 2; ++epoch) {
+        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
+            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+        fprintf(stderr, "\n");
+
+        ggml_opt_result_reset(result_train);
+        ggml_opt_result_reset(result_eval);
+    }
+    ggml_opt_result_free(result_train);
+    ggml_opt_result_free(result_eval);
+
+    llama_model_save_to_file(model.get(), "finetuned-model.gguf");
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -193,6 +193,7 @@ option(GGML_RPC                             "ggml: use RPC"
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
+option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -248,7 +248,7 @@ extern "C" {
        // preferrably to run on the same backend as the buffer
        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);

        // initialize buffers from a max size graph (optional)
        reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

    // Initialize backend buffers from a measure graph
--- a/ggml/include/ggml-opt.h
+++ b/ggml/include/ggml-opt.h
@@ -37,13 +37,16 @@ extern "C" {
    // ====== Dataset ======

    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
-            int64_t ne_datapoint, // number of elements per datapoint
-            int64_t ne_label,     // number of elements per label
-            int64_t ndata,        // total number of datapoints/labels
-            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
+            enum ggml_type type_data,    // the type for the internal data tensor
+            enum ggml_type type_label,   // the type for the internal labels tensor
+            int64_t        ne_datapoint, // number of elements per datapoint
+            int64_t        ne_label,     // number of elements per label
+            int64_t        ndata,        // total number of datapoints/labels
+            int64_t        ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);

    // get underlying tensors that store the data
+    GGML_API int64_t              ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label,     ndata]

@@ -56,13 +59,19 @@ extern "C" {
            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
            int64_t              ibatch);
+    GGML_API void ggml_opt_dataset_get_batch_host(
+            ggml_opt_dataset_t   dataset,
+            void               * data_batch,
+            size_t               nb_data_batch,
+            void               * labels_batch,
+            int64_t              ibatch);

    // ====== Model / Context ======

    enum ggml_opt_build_type {
-        GGML_OPT_BUILD_TYPE_FORWARD,
-        GGML_OPT_BUILD_TYPE_GRAD,
-        GGML_OPT_BUILD_TYPE_OPT,
+        GGML_OPT_BUILD_TYPE_FORWARD = 10,
+        GGML_OPT_BUILD_TYPE_GRAD    = 20,
+        GGML_OPT_BUILD_TYPE_OPT     = 30,
    };

    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
@@ -81,20 +90,22 @@ extern "C" {
    // userdata can be used to pass arbitrary data
    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);

-    // returns the default optimizer params (constant)
+    // returns the default optimizer params (constant, hard-coded values)
    // userdata is not used
    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);

+    // casts userdata to ggml_opt_optimizer_params and returns it
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
+
    // parameters for initializing a new optimization context
    struct ggml_opt_params {
        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs

-        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
-
-        // the forward graph is defined by inputs and outputs
-        // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
-        struct ggml_tensor * inputs;
-        struct ggml_tensor * outputs;
+        // by default the forward graph needs to be reconstructed for each eval
+        // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
+        struct ggml_context * ctx_compute;
+        struct ggml_tensor  * inputs;
+        struct ggml_tensor  * outputs;

        enum ggml_opt_loss_type  loss_type;
        enum ggml_opt_build_type build_type;
@@ -107,12 +118,9 @@ extern "C" {

    // get parameters for an optimization context with defaults set where possible
    // parameters for which no sensible defaults exist are supplied as arguments to this function
-    GGML_API ggml_opt_params ggml_opt_default_params(
-            ggml_backend_sched_t      backend_sched,
-            struct ggml_context     * ctx_compute,
-            struct ggml_tensor      * inputs,
-            struct ggml_tensor      * outputs,
-            enum ggml_opt_loss_type   loss_type);
+    GGML_API struct ggml_opt_params ggml_opt_default_params(
+            ggml_backend_sched_t    backend_sched,
+            enum ggml_opt_loss_type loss_type);

    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
@@ -120,7 +128,10 @@ extern "C" {
    // set gradients to zero, initilize loss, and optionally reset the optimizer
    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

+    GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
+
    // get underlying tensors that store data
+    // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
@@ -128,11 +139,12 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels

+    // get the gradient accumulator for a node from the forward graph
    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

    // ====== Optimization Result ======

-    GGML_API ggml_opt_result_t ggml_opt_result_init();
+    GGML_API ggml_opt_result_t ggml_opt_result_init(void);
    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);

@@ -144,11 +156,20 @@ extern "C" {

    // ====== Computation ======

-    // do forward pass, increment result if not NULL
-    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // if not using static graphs, this function must be called prior to ggml_opt_alloc
+    GGML_API void ggml_opt_prepare_alloc(
+        ggml_opt_context_t    opt_ctx,
+        struct ggml_context * ctx_compute,
+        struct ggml_cgraph  * gf,
+        struct ggml_tensor  * inputs,
+        struct ggml_tensor  * outputs);

-    // do forward pass, increment result if not NULL, do backward pass
-    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // allocate the next graph for evaluation, either forward or forward + backward
+    // must be called exactly once prior to calling ggml_opt_eval
+    GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
+
+    // do forward pass, increment result if not NULL, do backward pass if allocated
+    GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);

    // ############################################################################
    // ## The high-level functions start here. They do not depend on any private ##
@@ -200,9 +221,9 @@ extern "C" {
    // fit model defined by inputs and outputs to dataset
    GGML_API void ggml_opt_fit(
            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
-            ggml_context                  * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
-            ggml_tensor                   * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
-            ggml_tensor                   * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
+            struct ggml_context           * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
+            struct ggml_tensor            * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
+            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
            enum ggml_opt_loss_type         loss_type,      // loss to minimize
            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -536,6 +536,7 @@ extern "C" {
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
        GGML_UNARY_OP_EXP,
+        GGML_UNARY_OP_GELU_ERF,

        GGML_UNARY_OP_COUNT,
    };
@@ -768,7 +769,7 @@ extern "C" {
    // Tensor flags
    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(struct ggml_tensor * tensor);
    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);

    //
@@ -938,7 +939,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_repeat_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride

    // concat a and b along dim
    // used in stable-diffusion
@@ -1024,6 +1025,16 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // GELU using erf (error function) when possible
+    // some backends may fallback to approximation based on Abramowitz and Stegun formula
+    GGML_API struct ggml_tensor * ggml_gelu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    GGML_API struct ggml_tensor * ggml_gelu_quick(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@@ -2049,15 +2060,14 @@ extern "C" {

    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API void ggml_build_backward_expand(
-        struct ggml_context * ctx_static,  // context for static gradients (loss + gradient accumulation)
-        struct ggml_context * ctx_compute, // context for gradient computation
-        struct ggml_cgraph  * cgraph,
-        bool                  accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
+        struct ggml_context *  ctx,        // context for gradient computation
+        struct ggml_cgraph  *  cgraph,
+        struct ggml_tensor  ** grad_accs);

    // graph allocation in a context
    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -674,6 +674,8 @@ struct ggml_backend_sched {
    char * context_buffer;
    size_t context_buffer_size;

+    bool op_offload;
+
    int debug;
 };

@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
            // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+            if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
                for (int b = 0; b < src_backend_id; b++) {
                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                        SET_CAUSE(tensor, "1.off");
@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

            const int node_backend_id = tensor_backend_id(node);

-            assert(node_backend_id != -1); // all nodes should be assigned by now
+            assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

            // check if we should start a new split based on the sources of the current node
            bool need_new_split = false;
@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
        ggml_backend_buffer_type_t * bufts,
        int n_backends,
        size_t graph_size,
-        bool parallel) {
+        bool parallel,
+        bool op_offload) {
    GGML_ASSERT(n_backends > 0);
    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
    }

    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+    sched->op_offload = op_offload;

    ggml_backend_sched_reset(sched);

--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -65,6 +65,7 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_grouped_matmul_v2.h>
 #include <float.h>

 #include <cmath>
@@ -2587,3 +2588,276 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){

    ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific weight matrices. It uses the CANN backend for
+ * efficient computation and stores the result in the destination tensor `dst`.
+ * The operation may leverage identity-based optimizations or routing masks
+ * as part of sparse expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ * will be stored.
+ *
+ * @note This function assumes floating-point data types and is designed for
+ * MoE architectures, possibly involving sparse expert routing.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    //dst   [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  //src0	[D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  //src1	[D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  //ids	[K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as = ne02; // A
+    int64_t n_ids = ids->ne[0]; // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+        ACL_MEMCPY_DEVICE_TO_HOST);
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *)  dst->data;
+    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+    // src0 is F16, src1 is F32, dst is F32
+    ggml_cann_pool_alloc src0_cast_allocator;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+        void* src0_cast_buf = src0_cast_allocator.get();
+
+        size_t cast_nb[GGML_MAX_DIMS];
+        cast_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+        }
+
+        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+        src0_original = (char *) src0_cast_buf;
+        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+    }
+
+    std::vector<aclTensor*> src0_tensor_vec;
+    std::vector<aclTensor*> src1_tensor_vec;
+    std::vector<aclTensor*> dst_tensor_vec;
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // src0_row [M, D] -> weight && permute
+            int64_t src0_ne[2] = {ne01, ne00};
+            size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+            // src1_row [D, 1] -> input
+            int64_t src1_ne[2] = {ne10, 1};
+            size_t src1_nb[2] = {nb10, nb11};
+            // dst_row [M, 1] -> out
+            int64_t dst_ne[2] = {ne0, 1};
+            size_t dst_nb[2] = {nb0, nb1};
+
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src0_ne, src0_nb, 2);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src1_ne, src1_nb, 2);
+            aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                dst_ne, dst_nb, 2);
+
+            src0_tensor_vec.push_back(acl_src0);
+            src1_tensor_vec.push_back(acl_src1);
+            dst_tensor_vec.push_back(acl_dst);
+        }
+    }
+
+    size_t GROUP_SIZE = 128;
+    // GroupedMatmulV2 required tensor_list.size < 128
+    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+        // split and call GroupedMatmulV2
+        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+        aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+    }
+    return;
+}
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * quantized precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific quantized weight matrices. It leverages the CANN
+ * backend to perform efficient low-precision computations and stores the
+ * quantized result in the destination tensor `dst`.
+ *
+ * Quantization techniques reduce memory footprint and improve performance
+ * by using lower-bit representations (e.g., int8) instead of floating-point.
+ * This function is designed to work with such formats and may incorporate
+ * optimizations like identity-based fast paths or routing masks for sparse
+ * expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the quantized MoE multiplication result
+ * will be stored.
+ *
+ * @note This function assumes quantized data types and is designed for
+ * MoE architectures with potential sparse expert routing.
+ */
+static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // TODO: Use aclnnGroupedMatMul
+    //dst   [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  //src0	[D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  //src1	[D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  //ids	[K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as = ne02; // A
+    int64_t n_ids = ids->ne[0]; // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+        ACL_MEMCPY_DEVICE_TO_HOST);
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *)  dst->data;
+
+    ggml_tensor src0_row = *src0;
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    const enum ggml_type type = dst->src[0]->type;
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    } else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    } else {
+        GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
+    }
+
+    // src0_row [D, M, 1, 1] weight without permute
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[0] = weight_elem_size;
+    src0_row.nb[1] = weight_elem_size * ne00;
+    src0_row.nb[2] = weight_elem_size * ne00;
+    src0_row.nb[3] = weight_elem_size * ne00;
+    size_t weight_stride = ne00 * ne01 * weight_elem_size;
+    size_t weight_size = weight_stride * ne02 * ne03;
+
+    // scale [D, M, 1, 1] -> scale && permute
+    size_t scale_elem_size = sizeof(uint16_t);
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+
+    // src1_row [D, 1, 1, 1] -> input
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
+
+    // dst_row [M, 1, 1, 1] -> out
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
+
+    //create weight for one row
+    ggml_cann_pool_alloc weight_allocator(ctx.pool());
+    void* weight_buffer = weight_allocator.alloc(nb02);
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*weight_stride;
+            void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+
+            // mem cpy
+            ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
+                ACL_MEMCPY_DEVICE_TO_DEVICE);
+            void* scale_buffer = (char*)weight_buffer + weight_stride;
+            ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
+                ACL_MEMCPY_DEVICE_TO_DEVICE);
+
+            src0_row.data = weight_buffer;
+            src1_row.data = src1_tmp_ptr;
+            dst_row.data = dst_tmp_ptr;
+            dst_row.src[0] = &src0_row;
+            dst_row.src[1] = &src1_row;
+
+            ggml_cann_mul_mat(ctx, &dst_row);
+        }
+    }
+    return;
+}
+
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const enum ggml_type type = dst->src[0]->type;
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_mul_mat_id_fp(ctx, dst);
+            break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q8_0:
+            ggml_cann_mul_mat_id_quant(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupported type for mul_mat_id");
+            break;
+    }
+}
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -978,6 +978,33 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
    }
 }

+/**
+ * @brief   Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ *          is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ *          in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ *          For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ *          performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ *          and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ *          Dimensions:
+ *              - src0: [D, M, A, 1], where A is the number of experts
+ *              - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
+ *              - ids : [K, N],       where K is the number of experts each token is routed to
+ *              - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
+ *
+ *          The function handles two main modes:
+ *              - If `ne12 == 1`, a simpler per-token loop is used.
+ *              - TODO: If `ne12 > 1`, grouped multiplication and memory copying is used for efficiency.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ *            Expected to be of shape [M, K, N, 1].
+ */
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
 * @brief Applies a element-wise operation to two input tensors using the CANN
 * backend.
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1672,7 +1672,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_mul_mat(ctx, dst);
            break;
        case GGML_OP_MUL_MAT_ID:
-            return false;
+            ggml_cann_mul_mat_id(ctx, dst);
+            break;
        case GGML_OP_SCALE:
            ggml_cann_scale(ctx, dst);
            break;
@@ -2030,7 +2031,22 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
        }
        case GGML_OP_MUL_MAT_ID:
-            return false;
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
+                    return true;
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
+#ifdef ASCEND_310P
+                    // Q4 && Q8 per group is not suppor on 310p device
+                    return false;
+#endif
+                    // only support contiguous for quantized types.
+                    return ggml_is_contiguous(op->src[0]) &&
+                            ggml_is_contiguous(op->src[1]);
+                default:
+                    return false;
+            }
        // embedding
        case GGML_OP_GET_ROWS: {
            switch (op->src[0]->type) {
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -385,9 +385,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        # Fetch KleidiAI sources:
        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.6.0")
        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "ea22e1aefb800e9bc8c74d91633cc58e")
+        set(KLEIDIAI_ARCHIVE_MD5  "75b4ad68f25ab673dcc01065e5a0b05f")

        if (POLICY CMP0135)
            cmake_policy(SET CMP0135 NEW)
@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ${KLEIDIAI_SRC}/kai/ukernels/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)

        set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
        string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)

-        set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
+        set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})

-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
+        list(APPEND GGML_KLEIDIAI_SOURCES
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)

        if (NOT DOTPROD_ENABLED MATCHES -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
        endif()

        if (NOT I8MM_ENABLED MATCHES -1)
@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endif()

        if (NOT SME_ENABLED MATCHES -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
-            set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
+            set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
        endif()

        set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -8519,7 +8519,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
+#ifdef __ARM_FEATURE_MATMUL_INT8
+    assert((nrc == 2) || (nrc == 1));
+#else
    assert(nrc == 1);
+#endif
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
@@ -8530,6 +8534,197 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    const int nb = n / QK_K;

+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q6_K * GGML_RESTRICT x0 = x;
+        const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
+        const block_q8_K * GGML_RESTRICT y0 = y;
+        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
+
+        float32x4_t vfsum = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
+            const uint8_t * GGML_RESTRICT ql0 = x0->ql;
+            const uint8_t * GGML_RESTRICT ql1 = x1->ql;
+            const uint8_t * GGML_RESTRICT qh0 = x0->qh;
+            const uint8_t * GGML_RESTRICT qh1 = x1->qh;
+            const  int8_t * GGML_RESTRICT qy0 = y0->qs;
+            const  int8_t * GGML_RESTRICT qy1 = y1->qs;
+
+            const uint8x16_t mone = vdupq_n_u8(0x30);
+            const uint8x16_t  m4b = vdupq_n_u8(0x0f);
+
+            int32x4_t visum = vdupq_n_s32(0);
+
+            // process 8 blocks per iteration, totally 16 blocks
+            for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) {
+                int8x16_t vx0[8], vx1[8];
+
+                // de-quantize vx0[8]
+                {
+                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0);
+                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0);
+
+                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
+                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
+                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
+                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
+
+                    vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
+                    vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
+                    vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
+                    vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
+
+                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
+                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
+                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
+                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
+
+                    vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
+                    vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
+                    vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
+                    vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
+                }
+
+                // de-quantize vx1[8]
+                {
+                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1);
+                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1);
+
+                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
+                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
+                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
+                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
+
+                    vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
+                    vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
+                    vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
+                    vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
+
+                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
+                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
+                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
+                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
+
+                    vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
+                    vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
+                    vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
+                    vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
+                }
+
+                // process 16 elements (one block with same scale) per iteration
+                // - vx = concat(ql, qh) - 32
+                // - r1,r2,r3,r4 = smmla(vx, vy)
+                for (int k = 0; k < 8; ++k) {
+                    const int blk = j * 8 + k;
+
+                    const int8x16_t vy0 = vld1q_s8(qy0);
+                    const int8x16_t vy1 = vld1q_s8(qy1);
+                    qy0 += 16;
+                    qy1 += 16;
+
+                    const int32x4_t block_scale = {
+                        x0->scales[blk],
+                        x0->scales[blk],
+                        x1->scales[blk],
+                        x1->scales[blk],
+                    };
+
+                    // calculate four results at once with outer product
+                    const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
+                    const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
+                    const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
+                    const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
+                    int32x4_t vr = vdupq_n_s32(0);
+                    vr = vmmlaq_s32(vr, vx_l, vy_l);
+                    vr = vmmlaq_s32(vr, vx_h, vy_h);
+
+                    // apply block scale, will NOT overflow
+                    // block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits
+                    visum = vmlaq_s32(visum, vr, block_scale);
+                }
+            }
+
+            // adjust bias, apply superblock scale
+            {
+                int32_t bias[4];
+#ifdef __ARM_FEATURE_SVE
+                const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
+                const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
+                const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
+                const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
+                const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
+                const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
+                const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
+                const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
+                const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
+                const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
+                const svint64_t zero = svdup_n_s64(0);
+                bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
+                                                                               svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
+                bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
+                                                                               svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
+                bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
+                                                                               svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
+                bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
+                                                                               svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
+#else
+                // NEON doesn't support int16 dot product, fallback to separated mul and add
+                const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
+                const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
+
+                int8x16_t scales_s8 = vld1q_s8(x0->scales);
+                const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
+                scales_s8 = vld1q_s8(x1->scales);
+                const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
+
+                int32x4_t prod;
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1]))));
+                bias[0] = vaddvq_s32(prod);
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1]))));
+                bias[1] = vaddvq_s32(prod);
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1]))));
+                bias[2] = vaddvq_s32(prod);
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
+                bias[3] = vaddvq_s32(prod);
+
+#endif
+                const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
+
+                const float32x4_t superblock_scale = {
+                    GGML_FP16_TO_FP32(x0->d) * y0->d,
+                    GGML_FP16_TO_FP32(x0->d) * y1->d,
+                    GGML_FP16_TO_FP32(x1->d) * y0->d,
+                    GGML_FP16_TO_FP32(x1->d) * y1->d,
+                };
+
+                visum = vsubq_s32(visum, vibias);
+                vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
+            }
+        }
+
+        // vfsum = ABCD -> ACBD
+        // AC -> s, BD -> (s+bs)
+        vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
+        vst1_f32(s,      vget_low_f32 (vfsum));
+        vst1_f32(s + bs, vget_high_f32(vfsum));
+
+        return;
+    }
+#endif
+
 #ifdef __ARM_FEATURE_SVE
    const int vector_length = ggml_cpu_get_sve_cnt()*8;
    float sum = 0;
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -282,7 +282,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .from_float               = quantize_row_q6_K,
        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
        .nrows                    = 1,
+#endif
    },
    [GGML_TYPE_IQ2_XXS] = {
        .from_float               = NULL,
@@ -2198,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                    } break;

                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                    {
@@ -3479,6 +3484,19 @@ void ggml_cpu_init(void) {
            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+
+#ifdef GGML_USE_OPENMP
+            //if (!getenv("OMP_WAIT_POLICY")) {
+            //    // set the wait policy to active, so that OpenMP threads don't sleep
+            //    putenv("OMP_WAIT_POLICY=active");
+            //}
+
+            if (!getenv("KMP_BLOCKTIME")) {
+                // set the time to wait before sleeping a thread
+                // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
+                putenv("KMP_BLOCKTIME=200"); // 200ms
+            }
+#endif
        }

 #if defined(__ARM_ARCH)
--- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@@ -4,16 +4,22 @@

 // KleidiAI micro-kernels
 #include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
-#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
-#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
-#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
-#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
+#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
+
+#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
+#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
+#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
+
+#include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
+#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
+#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
+
 #include "kai_common.h"

 #include "kernels.h"
@@ -61,6 +67,53 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
        },
        /* .required_cpu       = */ CPU_FEATURE_SME,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
+    },
+    {
+        /* SME GEMM */
+        /* .kern_info = */ {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .run_kernel            = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+        },
+        /* SME GEMV */
+        /* .kern_info = */ {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .run_kernel            = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+        },
+        /* .lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
+            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme,
+            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme,
+            /* .pack_func             = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
+        },
+        /* .rhs_info = */ {
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+            /* .pack_func   = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+        },
+        /* .required_cpu       = */ CPU_FEATURE_SME,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_F16,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #if defined(__APPLE__)
@@ -105,6 +158,9 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #if defined(__ARM_FEATURE_MATMUL_INT8)
@@ -148,6 +204,9 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #else
@@ -192,6 +251,9 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #if defined(__ARM_FEATURE_DOTPROD)
@@ -235,12 +297,33 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #endif
 };

-ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature features) {
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
+    ggml_kleidiai_kernels * kernel = nullptr;
+
+    if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
+        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+            if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
+                gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
+                gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
+                gemm_gemv_kernels[i].op_type  == tensor->type) {
+                kernel = &gemm_gemv_kernels[i];
+                break;
+            }
+        }
+    }
+
+    return kernel;
+}
+
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
    ggml_kleidiai_kernels * kernels = nullptr;

    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
--- a/ggml/src/ggml-cpu/kleidiai/kernels.h
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.h
@@ -4,6 +4,10 @@

 #pragma once

+#include <functional>
+#include <variant>
+#include "ggml.h"
+
 enum cpu_feature {
    CPU_FEATURE_NONE    = 0,
    CPU_FEATURE_DOTPROD = 1,
@@ -26,26 +30,53 @@ struct kernel_info {
    size_t (*get_nr)(void);
    size_t (*get_kr)(void);
    size_t (*get_sr)(void);
-    size_t (*get_lhs_offset)(size_t m_idx, size_t k, size_t bl);
-    size_t (*get_rhs_packed_offset)(size_t n_idx, size_t k, size_t bl);
+    std::variant<
+        std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
+        std::function<size_t(size_t m_idx, size_t k)>
+    > get_lhs_offset;
+    std::variant<
+        std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
+        std::function<size_t(size_t n_idx, size_t k)>
+    > get_rhs_packed_offset;
    size_t (*get_dst_offset)(size_t m_idx, size_t n_idx, size_t stride);
    size_t (*get_dst_size)(size_t m, size_t n);
-    void (*run_kernel)(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
-                         float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max);
+    std::variant<
+        std::function<void(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
+            float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max)>,
+        std::function<void(size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row,
+            size_t dst_stride_col, float clamp_min, float clamp_max)>
+    > run_kernel;
 };

 struct lhs_packing_info {
    size_t (*get_offset)(size_t m_idx, size_t lhs_stride);
-    size_t (*get_packed_offset)(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
-    size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
-    void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
-                      size_t lhs_stride, void* lhs_packed);
+    std::variant<
+        std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
+        std::function<size_t(size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr)>
+    > get_packed_offset;
+    std::variant<
+        std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
+        std::function<size_t(size_t m, size_t k, size_t mr, size_t kr, size_t sr)>
+    > packed_size;
+    std::variant<
+        std::function<void(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
+            size_t lhs_stride, void* lhs_packed)>,
+        std::function<void(size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* lhs, size_t lhs_stride,
+        void* lhs_packed)>
+    > pack_func;
 };

 struct rhs_packing_info {
-    size_t (*packed_size)(size_t n, size_t k, size_t nr, size_t kr, size_t bl);
-    void (*pack_func)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
-                      const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params);
+    std::variant<
+        std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
+        std::function<size_t(size_t n, size_t k)>
+    > packed_size;
+    std::variant<
+        std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
+            const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
+        std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
+            const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
+    > pack_func;
 };

 struct ggml_kleidiai_kernels {
@@ -55,6 +86,10 @@ struct ggml_kleidiai_kernels {
    rhs_packing_info rhs_info;

    cpu_feature required_cpu;
+    ggml_type lhs_type;
+    ggml_type rhs_type;
+    ggml_type op_type;
 };

-ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features);
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
@@ -3,7 +3,9 @@
 //
 #include <arm_neon.h>
 #include <assert.h>
+#include <atomic>
 #include <cfloat>
+#include <stdexcept>
 #include <stdint.h>
 #include <string.h>
 #if defined(__linux__)
@@ -34,8 +36,9 @@
 #include "ggml-common.h"

 struct ggml_kleidiai_context {
+    cpu_feature features;
    ggml_kleidiai_kernels * kernels;
-} static ctx = { NULL };
+} static ctx = { CPU_FEATURE_NONE, NULL };

 static void init_kleidiai_context(void) {

@@ -47,18 +50,18 @@ static void init_kleidiai_context(void) {
        const char *env_var = getenv("GGML_KLEIDIAI_SME");
        int sme_enabled = 0;

-        cpu_feature features  = (ggml_cpu_has_dotprod()     ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
-                                (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM    : CPU_FEATURE_NONE) |
-                                (ggml_cpu_has_sve()         ? CPU_FEATURE_SVE     : CPU_FEATURE_NONE);
+        ctx.features  = (ggml_cpu_has_dotprod()     ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
+                        (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM    : CPU_FEATURE_NONE) |
+                        (ggml_cpu_has_sve()         ? CPU_FEATURE_SVE     : CPU_FEATURE_NONE);

        if (env_var) {
            sme_enabled = atoi(env_var);
        }

        if (sme_enabled != 0) {
-            features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
+            ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
        }
-        ctx.kernels = ggml_kleidiai_select_kernels(features);
+        ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
    }
    ggml_critical_section_end();
 }
@@ -68,95 +71,275 @@ static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
    return tensor->ne[dim];
 }

+template<typename Ret, typename Variant, typename... Args>
+static Ret variant_call(const Variant & var, Args&&... args) {
+    return std::visit([&](auto&& func) -> Ret {
+        if constexpr (std::is_invocable_r_v<Ret, decltype(func), Args...>) {
+            return func(std::forward<Args>(args)...);
+        } else {
+            throw std::runtime_error("Invalid function type in variant_call");
+        }
+    }, var);
+}
+
 namespace ggml::cpu::kleidiai {
+
+static size_t round_down(size_t x, size_t y) {
+    return y == 0 ? x : x - (x % y);
+}
+
+static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint16_t * src, size_t rhs_stride) {
+    size_t src_stride = rhs_stride / sizeof(uint16_t);
+    size_t dst_stride = n;
+
+    for (size_t k_idx = 0; k_idx < k; ++k_idx) {
+        for (size_t n_idx = 0; n_idx < n; ++n_idx) {
+            uint16_t v = *(src + k_idx + n_idx * src_stride);
+            *(dst + n_idx + k_idx * dst_stride) = kai_cast_f32_f16(v);
+        }
+    }
+}
+
 class tensor_traits : public ggml::cpu::tensor_traits {
    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-        GGML_ASSERT(ctx.kernels);
-        kernel_info * kernel = op->src[1]->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
+        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
+        GGML_ASSERT(kernels);
+        kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;

        size_t k = op->src[0]->ne[0];
+        size_t n = op->src[0]->ne[1];
        size_t m = op->src[1]->ne[1];

        size_t mr = kernel->get_mr();
        size_t kr = kernel->get_kr();
        size_t sr = kernel->get_sr();

-        size = ctx.kernels->lhs_info.packed_size(m, k, QK4_0, mr, kr, sr);
+        if (kernels->rhs_type == GGML_TYPE_Q4_0) {
+            size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, QK4_0, mr, kr, sr);
+        } else if (kernels->rhs_type == GGML_TYPE_F16) {
+            size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr) +
+                   variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
+                   k * n * sizeof(float) + n * sizeof(float);
+        } else {
+            GGML_ASSERT(false);
+        }

        return true;
    }

+
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
        if (dst->op == GGML_OP_MUL_MAT) {
-            const ggml_tensor * src0 = dst->src[0];
-            const ggml_tensor * src1 = dst->src[1];
+            if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+                return compute_forward_q4_0(params, dst);
+            } else if (dst->src[0]->type == GGML_TYPE_F16) {
+                return compute_forward_kv_cache(params, dst);
+            }
+        }
+        return false;
+    }

-            GGML_TENSOR_BINARY_OP_LOCALS
+    bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
+        static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;

-            GGML_ASSERT(ctx.kernels);
-            kernel_info * kernel = src1->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
-            lhs_packing_info * lhs_info = &ctx.kernels->lhs_info;
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];

-            GGML_ASSERT(kernel);
+        GGML_TENSOR_BINARY_OP_LOCALS

-            const int ith = params->ith;
-            const int nth = params->nth;
+        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
+        GGML_ASSERT(kernels);

-            const size_t k = ne00;
-            const size_t m = ne11;
-            const size_t n = ne01;
+        kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
+        GGML_ASSERT(kernel);

-            const size_t n_step = kernel->get_n_step();
-            const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
-            const size_t n_start = ith * num_n_per_thread;
+        const int nth = params->nth;
+        const int ith = params->ith;

-            size_t n_to_process = num_n_per_thread;
-            if ((n_start + n_to_process) > n) {
-                n_to_process = n - n_start;
+        const int64_t lhs_batch_size0 = ne12;
+        const int64_t rhs_batch_size0 = ne02;
+        const int64_t batch_size      = rhs_batch_size0;
+
+        const int64_t r = lhs_batch_size0 / rhs_batch_size0;
+
+        const int64_t m = ne11 * r;
+        const int64_t n = ne01;
+        const int64_t k = ne00;
+
+        const size_t lhs_stride = src1->nb[1];
+        const size_t rhs_stride = src0->nb[1];
+        const size_t dst_stride = dst->nb[1];
+
+        const int64_t mr = static_cast<int64_t>(kernel->get_mr());
+        const int64_t nr = static_cast<int64_t>(kernel->get_nr());
+        const int64_t kr = static_cast<int64_t>(kernel->get_kr());
+        const int64_t sr = static_cast<int64_t>(kernel->get_sr());
+
+        const size_t lhs_packed_size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr);
+        const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k);
+        const size_t kxn_size        = k * n * sizeof(float);
+        const size_t bias_size       = n * sizeof(float);
+
+        const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
+        GGML_ASSERT(wsize_required <= params->wsize);
+
+        uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
+        uint8_t * rhs_packed = lhs_packed + lhs_packed_size;
+        uint8_t * rhs_kxn    = rhs_packed + rhs_packed_size;
+        uint8_t * bias       = rhs_kxn + kxn_size;
+
+        for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
+            const uint8_t * lhs_batch = static_cast<const uint8_t *>(src1->data) + batch_idx * m * lhs_stride;
+            const uint8_t * rhs_batch = static_cast<const uint8_t *>(src0->data) + batch_idx * n * rhs_stride;
+            uint8_t * dst_batch       = static_cast<uint8_t *>(dst->data) + batch_idx * m * dst_stride;
+
+            // LHS packing
+            {
+                const int64_t m_roundup_mr = kai_roundup(m, mr);
+                const int64_t num_threads  = KAI_MIN(m_roundup_mr / mr, nth);
+
+                if (ith < num_threads) {
+                    const int64_t num_m_per_thread0   = round_down(m_roundup_mr / num_threads, mr);
+                    const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;
+
+                    const int64_t m_start          = ith * num_m_per_thread0;
+                    const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
+
+                    const size_t lhs_offset        = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride);
+                    const size_t lhs_packed_offset = variant_call<size_t>(kernels->lhs_info.get_packed_offset, m_start, k, mr, kr, sr);
+
+                    const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset;
+                    void * dst_ptr       = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset;
+
+                    variant_call<void>(kernels->lhs_info.pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
+                }
            }

-            const uint8_t * lhs        = static_cast<const uint8_t *>(src1->data);
-            uint8_t * lhs_packed       = (uint8_t*)params->wdata;
-            const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
+            // RHS packing
+            if (first_to_arrive.test_and_set(std::memory_order_acquire) == false) {
+                // First thread to reach this point handles RHS packing
+                memset(bias, 0, n * sizeof(float));
+                transpose_f32kxn_f16nxk(n, k, reinterpret_cast<float *>(rhs_kxn),
+                                        reinterpret_cast<const uint16_t *>(rhs_batch), rhs_stride);

-            size_t mr = kernel->get_mr();
-            size_t kr = kernel->get_kr();
-            size_t sr = kernel->get_sr();
-
-            // Calculate number of columns to be processed per thread
-            const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
-            const size_t m_start = ith * num_m_per_thread;
-            size_t m_to_process = num_m_per_thread;
-            if ((m_start + m_to_process) > m) {
-                m_to_process = m - m_start;
-            }
-
-            if(m_start < m) {
-                // Transform LHS
-                const size_t src_stride        = src1->nb[1];
-                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
-                const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
-                void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);
-
-                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
+                variant_call<void>(kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, n * sizeof(float),
+                             rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr);
            }

            ggml_barrier(params->threadpool);

-            // Perform the operation
-            const size_t dst_stride        = dst->nb[1];
-            const size_t lhs_packed_offset = lhs_info->get_packed_offset(0, k, QK4_0, mr, kr, sr);
-            const size_t rhs_packed_offset = kernel->get_rhs_packed_offset(n_start, k, QK4_0);
-            const size_t dst_offset        = kernel->get_dst_offset(0, n_start, dst_stride);
-            const void * rhs_ptr           = static_cast<const void *>(rhs_packed + rhs_packed_offset);
-            const void* lhs_ptr            = (const void*)((const char *)lhs_packed + lhs_packed_offset);
-            float *dst_ptr                 = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
+            first_to_arrive.clear(std::memory_order_release);

-            kernel->run_kernel(m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr,
-                               dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
-            return true;
+            // Perform the matmul
+            {
+                const int64_t m_to_process = m;
+                const int64_t m_start      = 0;
+
+                const int64_t n_step      = static_cast<int64_t>(kernel->get_n_step());
+                const int64_t num_threads = KAI_MIN(n / n_step, nth);
+
+                if (ith < num_threads) {
+                    const int64_t num_n_per_thread0   = round_down(n / num_threads, n_step);
+                    const int64_t num_n_per_threadN_1 = n - (num_threads - 1) * num_n_per_thread0;
+
+                    const int64_t n_start      = ith * num_n_per_thread0;
+                    const int64_t n_to_process = (ith == num_threads - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
+
+                    const size_t lhs_packed_offset = variant_call<size_t>(kernel->get_lhs_offset, m_start, k);
+                    const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k);
+                    const size_t dst_offset        = kernel->get_dst_offset(m_start, n_start, dst_stride);
+
+                    const void * lhs_ptr = lhs_packed + lhs_packed_offset;
+                    const void * rhs_ptr = rhs_packed + rhs_packed_offset;
+                    float * dst_ptr      = reinterpret_cast<float *>(dst_batch + dst_offset);
+
+                    variant_call<void>(kernel->run_kernel, m_to_process, n_to_process, k, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
+                }
+            }
+
+            if (batch_idx != batch_size - 1) {
+                // This barrier is necessary when the batch size is larger than 1. While processing a batch,
+                // the work data buffer (params->wdata) is used as temporary storage which means that only
+                // a single batch can be processed at any given time. No barrier is needed for the last
+                // batch since GGML inserts a barrier between the execution of every operator.
+                ggml_barrier(params->threadpool);
+            }
        }
-        return false;
+
+        return true;
+    }
+
+    bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
+        GGML_ASSERT(kernels);
+
+        kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
+        lhs_packing_info * lhs_info = &kernels->lhs_info;
+
+        GGML_ASSERT(kernel);
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        const size_t k = ne00;
+        const size_t m = ne11;
+        const size_t n = ne01;
+
+        size_t mr = kernel->get_mr();
+        size_t kr = kernel->get_kr();
+        size_t sr = kernel->get_sr();
+
+        const uint8_t * lhs        = static_cast<const uint8_t *>(src1->data);
+        uint8_t * lhs_packed       = (uint8_t*)params->wdata;
+        const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
+
+        const size_t n_step = kernel->get_n_step();
+        const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
+        const size_t n_start = ith * num_n_per_thread;
+
+        size_t n_to_process = num_n_per_thread;
+        if ((n_start + n_to_process) > n) {
+            n_to_process = n - n_start;
+        }
+
+        // Calculate number of columns to be processed per thread
+        const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
+        const size_t m_start = ith * num_m_per_thread;
+        size_t m_to_process = num_m_per_thread;
+        if ((m_start + m_to_process) > m) {
+            m_to_process = m - m_start;
+        }
+
+        if (m_start < m) {
+            // Transform LHS
+            const size_t src_stride        = src1->nb[1];
+            const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
+            const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, QK4_0, mr, kr, sr);
+            void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);
+
+            variant_call<void>(lhs_info->pack_func, m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        // Perform the operation
+        const size_t dst_stride        = dst->nb[1];
+        const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, 0, k, QK4_0, mr, kr, sr);
+        const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k, QK4_0);
+        const size_t dst_offset        = kernel->get_dst_offset(0, n_start, dst_stride);
+        const void * rhs_ptr           = static_cast<const void *>(rhs_packed + rhs_packed_offset);
+        const void* lhs_ptr            = (const void*)((const char *)lhs_packed + lhs_packed_offset);
+        float *dst_ptr                 = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
+
+        variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
+                           sizeof(float), -FLT_MAX, FLT_MAX);
+
+        return true;
    }

 public:
@@ -169,13 +352,13 @@ public:
        size_t sr      = ctx.kernels->gemm.get_sr();

 #ifndef NDEBUG
-        const size_t repacked_size = ctx.kernels->rhs_info.packed_size(n, k, nr, kr, QK4_0);
+        const size_t repacked_size = variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
        GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
 #endif
        struct kai_rhs_pack_qs4cxs1s0_param params;
        params.lhs_zero_point = 1;
        params.rhs_zero_point = 8;
-        ctx.kernels->rhs_info.pack_func(1, n, k, nr, kr, sr, QK4_0, (const uint8_t *)data, NULL, tensor->data, 0, &params);
+        variant_call<void>(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, &params);

        return 0;

@@ -189,7 +372,7 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
 }
 }  // namespace ggml::cpu::kleidiai

-GGML_API enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);

    GGML_UNUSED(buffer);
@@ -238,12 +421,11 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
 namespace ggml::cpu::kleidiai {
 class extra_buffer_type : ggml::cpu::extra_buffer_type {
    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-        if (    op->op == GGML_OP_MUL_MAT &&
-                op->src[0]->type == GGML_TYPE_Q4_0 &&
-                op->src[0]->buffer &&
-                (ggml_n_dims(op->src[0]) == 2) &&
-                op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels
-                ) {
+        if (op->op == GGML_OP_MUL_MAT &&
+            op->src[0]->type == GGML_TYPE_Q4_0 &&
+            op->src[0]->buffer &&
+            (ggml_n_dims(op->src[0]) == 2) &&
+            op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
@@ -260,6 +442,19 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
            }
+            else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
+                     op->src[0]->op == GGML_OP_VIEW &&
+                     (op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op ==  GGML_OP_SOFT_MAX) &&
+                     op->src[1]->ne[1] > 1) {
+                if ((op->src[0]->nb[0] != 2) ||
+                    (op->src[1]->nb[0] != 4) ||
+                    (op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
+                    (op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
+                    return nullptr;
+                }
+
+                return ggml::cpu::kleidiai::get_tensor_traits(NULL, NULL);
+            }
        }
        return nullptr;
    }
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu(
    }
 }

+// ggml_compute_forward_gelu_erf
+
+static void ggml_compute_forward_gelu_erf_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_erf_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_erf_f16(
+    const ggml_compute_params * params,
+    ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_erf_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_erf(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_erf_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_gelu_erf_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_gelu_quick

 static void ggml_compute_forward_gelu_quick_f32(
@@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary(
            {
                ggml_compute_forward_gelu(params, dst);
            } break;
+        case GGML_UNARY_OP_GELU_ERF:
+            {
+                ggml_compute_forward_gelu_erf(params, dst);
+            } break;
        case GGML_UNARY_OP_GELU_QUICK:
            {
                ggml_compute_forward_gelu_quick(params, dst);
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;

 inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
    }
 }

+inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_FP16_TO_FP32(x[i]);
+        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+        y[i] = GGML_FP32_TO_FP16(res);
+    }
+}
+
 #ifdef GGML_GELU_FP16
 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
@@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 }
 #endif

+inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+    }
+}
+
 inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
 }
--- a/ggml/src/ggml-cuda/acc.cu
+++ b/ggml/src/ggml-cuda/acc.cu
@@ -1,47 +1,61 @@
 #include "acc.cuh"

-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
+    const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
    if (i >= ne) {
        return;
    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
+
+    int64_t src1_idx = i - offset;
+
+    int64_t tmp = src1_idx;
+    const int64_t i13 = tmp / s13;
+    tmp -= i13 * s13;
+    const int64_t i12 = tmp / s12;
+    tmp -= i12 * s12;
+    const int64_t i11 = tmp / s11;
+    tmp -= i11 * s11;
+    const int64_t i10 = tmp;
+
+    float val = x[i];
+    if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
+        val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
    }
+    dst[i] = val;
 }

-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
+    const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
 }

 void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *)  dst->data;
+
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
+    GGML_ASSERT(ggml_is_contiguously_allocated(dst));

-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+    const int64_t s1     = dst->op_params[0] / sizeof(float);
+    const int64_t s2     = dst->op_params[1] / sizeof(float);
+    const int64_t s3     = dst->op_params[2] / sizeof(float);
+    const int64_t offset = dst->op_params[3] / sizeof(float);
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
 }
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -1,5 +1,8 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
+#ifdef GGML_USE_MUSA
+#include "ggml-musa/mudnn.cuh"
+#endif // GGML_USE_MUSA

 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+#ifdef GGML_USE_MUSA
+        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
+            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
+        } else
+#endif // GGML_USE_MUSA
+        {
+            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+        }
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -678,10 +678,14 @@ void launch_fattn(
 ) {
    constexpr int ncols = ncols1 * ncols2;

+    const bool is_mla = DV == 512; // TODO better parameterization
+
    const ggml_tensor * Q = dst->src[0];
    const ggml_tensor * K = dst->src[1];
    const ggml_tensor * V = dst->src[2];

+    GGML_ASSERT(V || is_mla);
+
    const ggml_tensor * mask = dst->src[3];

    ggml_tensor * KQV = dst;
@@ -689,6 +693,10 @@ void launch_fattn(
    GGML_ASSERT(Q->type == GGML_TYPE_F32);
    GGML_ASSERT(KQV->type == GGML_TYPE_F32);

+    GGML_ASSERT(      Q->nb[0] == ggml_element_size(Q));
+    GGML_ASSERT(      K->nb[0] == ggml_element_size(K));
+    GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));
+
    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
    GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
        "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
@@ -713,10 +721,10 @@ void launch_fattn(
    size_t nb12 = K->nb[2];
    size_t nb13 = K->nb[3];

-    const char * V_data = (const char *) V->data;
-    size_t nb21 = V->nb[1];
-    size_t nb22 = V->nb[2];
-    size_t nb23 = V->nb[3];
+    const char * V_data = V ? (const char *) V->data : nullptr;
+    size_t nb21 = V ? V->nb[1] : nb11;
+    size_t nb22 = V ? V->nb[2] : nb12;
+    size_t nb23 = V ? V->nb[3] : nb13;

    if (need_f16_K && K->type != GGML_TYPE_F16) {
        GGML_ASSERT(ggml_is_contiguously_allocated(K));
@@ -733,7 +741,7 @@ void launch_fattn(
        nb13 = nb13*bs*sizeof(half)/ts;
    }

-    if (need_f16_V && V->type != GGML_TYPE_F16) {
+    if (V && need_f16_V && V->type != GGML_TYPE_F16) {
        GGML_ASSERT(ggml_is_contiguously_allocated(V));
        V_f16.alloc(ggml_nelements(V));
        to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -33,9 +33,30 @@ struct fattn_mma_f16_config< 64,  64> {
    static constexpr int  nwarps_max     = 4;
    static constexpr bool Q_in_reg       = true;
    static constexpr int  nstages_target = 2;
-    static constexpr int  nbatch_K2      = 32;
-    static constexpr int  nbatch_V2      = 32;
-    static constexpr int  nbatch_combine = 32;
+
+    static int get_nbatch_K2_host(const int /*cc*/, const int /*ncols*/) {
+        return 32;
+    }
+
+    static constexpr __device__ int get_nbatch_K2_device(int /*ncols*/) {
+        return 32;
+    }
+
+    static int get_nbatch_V2_host(const int /*cc*/, const int /*ncols*/) {
+        return 32;
+    }
+
+    static constexpr __device__ int get_nbatch_V2_device(int /*ncols*/) {
+        return 32;
+    }
+
+    static int get_nbatch_combine_host(const int /*cc*/, const int /*ncols*/) {
+        return 32;
+    }
+
+    static constexpr __device__ int get_nbatch_combine_device(int /*ncols*/) {
+        return 32;
+    }
 };

 template <>
@@ -44,9 +65,30 @@ struct fattn_mma_f16_config< 80,  80> {
    static constexpr int  nwarps_max     = 4;
    static constexpr bool Q_in_reg       = true;
    static constexpr int  nstages_target = 2;
-    static constexpr int  nbatch_K2      = 40;
-    static constexpr int  nbatch_V2      = 40;
-    static constexpr int  nbatch_combine = 40;
+
+    static int get_nbatch_K2_host(const int /*cc*/, const int /*ncols*/) {
+        return 40;
+    }
+
+    static constexpr __device__ int get_nbatch_K2_device(int /*ncols*/) {
+        return 40;
+    }
+
+    static int get_nbatch_V2_host(const int /*cc*/, const int /*ncols*/) {
+        return 40;
+    }
+
+    static constexpr __device__ int get_nbatch_V2_device(int /*ncols*/) {
+        return 40;
+    }
+
+    static int get_nbatch_combine_host(const int /*cc*/, const int /*ncols*/) {
+        return 40;
+    }
+
+    static constexpr __device__ int get_nbatch_combine_device(int /*ncols*/) {
+        return 40;
+    }
 };

 template <>
@@ -55,9 +97,30 @@ struct fattn_mma_f16_config< 96,  96> {
    static constexpr int  nwarps_max     = 4;
    static constexpr bool Q_in_reg       = true;
    static constexpr int  nstages_target = 2;
-    static constexpr int  nbatch_K2      = 48;
-    static constexpr int  nbatch_V2      = 48;
-    static constexpr int  nbatch_combine = 48;
+
+    static int get_nbatch_K2_host(const int /*cc*/, const int /*ncols*/) {
+        return 48;
+    }
+
+    static constexpr __device__ int get_nbatch_K2_device(int /*ncols*/) {
+        return 48;
+    }
+
+    static int get_nbatch_V2_host(const int /*cc*/, const int /*ncols*/) {
+        return 48;
+    }
+
+    static constexpr __device__ int get_nbatch_V2_device(int /*ncols*/) {
+        return 48;
+    }
+
+    static int get_nbatch_combine_host(const int /*cc*/, const int /*ncols*/) {
+        return 48;
+    }
+
+    static constexpr __device__ int get_nbatch_combine_device(int /*ncols*/) {
+        return 48;
+    }
 };

 template <>
@@ -66,9 +129,30 @@ struct fattn_mma_f16_config<112, 112> {
    static constexpr int  nwarps_max     = 4;
    static constexpr bool Q_in_reg       = true;
    static constexpr int  nstages_target = 2;
-    static constexpr int  nbatch_K2      = 56;
-    static constexpr int  nbatch_V2      = 56;
-    static constexpr int  nbatch_combine = 56;
+
+    static int get_nbatch_K2_host(const int /*cc*/, const int /*ncols*/) {
+        return 56;
+    }
+
+    static constexpr __device__ int get_nbatch_K2_device(int /*ncols*/) {
+        return 56;
+    }
+
+    static int get_nbatch_V2_host(const int /*cc*/, const int /*ncols*/) {
+        return 56;
+    }
+
+    static constexpr __device__ int get_nbatch_V2_device(int /*ncols*/) {
+        return 56;
+    }
+
+    static int get_nbatch_combine_host(const int /*cc*/, const int /*ncols*/) {
+        return 56;
+    }
+
+    static constexpr __device__ int get_nbatch_combine_device(int /*ncols*/) {
+        return 56;
+    }
 };

 template <>
@@ -77,9 +161,30 @@ struct fattn_mma_f16_config<128, 128> {
    static constexpr int  nwarps_max     = 4;
    static constexpr bool Q_in_reg       = true;
    static constexpr int  nstages_target = 2;
-    static constexpr int  nbatch_K2      = 64;
-    static constexpr int  nbatch_V2      = 64;
-    static constexpr int  nbatch_combine = 64;
+
+    static int get_nbatch_K2_host(const int /*cc*/, const int /*ncols*/) {
+        return 64;
+    }
+
+    static constexpr __device__ int get_nbatch_K2_device(int /*ncols*/) {
+        return 64;
+    }
+
+    static int get_nbatch_V2_host(const int /*cc*/, const int /*ncols*/) {
+        return 64;
+    }
+
+    static constexpr __device__ int get_nbatch_V2_device(int /*ncols*/) {
+        return 64;
+    }
+
+    static int get_nbatch_combine_host(const int /*cc*/, const int /*ncols*/) {
+        return 64;
+    }
+
+    static constexpr __device__ int get_nbatch_combine_device(int /*ncols*/) {
+        return 64;
+    }
 };

 template <>
@@ -88,9 +193,38 @@ struct fattn_mma_f16_config<256, 256> {
    static constexpr int  nwarps_max     = 4;
    static constexpr bool Q_in_reg       = true;
    static constexpr int  nstages_target = 2;
-    static constexpr int  nbatch_K2      = 128;
-    static constexpr int  nbatch_V2      = 128;
-    static constexpr int  nbatch_combine = 128;
+
+    static int get_nbatch_K2_host(const int /*cc*/, const int /*ncols*/) {
+        return 128;
+    }
+
+    static constexpr __device__ int get_nbatch_K2_device(int /*ncols*/) {
+        return 128;
+    }
+
+    static int get_nbatch_V2_host(const int /*cc*/, const int /*ncols*/) {
+        return 128;
+    }
+
+    static constexpr __device__ int get_nbatch_V2_device(int /*ncols*/) {
+        return 128;
+    }
+
+    static int get_nbatch_combine_host(const int cc, const int ncols) {
+        if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING) {
+            return ncols <= 16 ? 128 : 64;
+        }
+        return 64;
+    }
+
+    static constexpr __device__ int get_nbatch_combine_device(int ncols) {
+#if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+        return ncols <= 16 ? 128 : 64;
+#else
+        GGML_UNUSED(ncols);
+        return 128;
+#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+    }
 };

 template <>
@@ -99,9 +233,44 @@ struct fattn_mma_f16_config<576, 512> {
    static constexpr int  nwarps_max     = 8;
    static constexpr bool Q_in_reg       = false;
    static constexpr int  nstages_target = 1;
-    static constexpr int  nbatch_K2      = 160;
-    static constexpr int  nbatch_V2      = 128;
-    static constexpr int  nbatch_combine = 128;
+
+    static int get_nbatch_K2_host(const int cc, const int ncols) {
+        if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING) {
+            return ncols <= 16 ? 96 : 160;
+        }
+        return ncols <= 16 ? 288 : 160;
+    }
+
+    static constexpr __device__ int get_nbatch_K2_device(int ncols) {
+#if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+        return ncols <= 16 ? 96 : 160;
+#else
+        return ncols <= 16 ? 288 : 160;
+#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+    }
+
+    static int get_nbatch_V2_host(const int cc, const int ncols) {
+        if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING) {
+            return ncols <= 16 ? 64 : 128;
+        }
+        return ncols <= 16 ? 256 : 128;
+    }
+
+    static constexpr __device__ int get_nbatch_V2_device(int ncols) {
+#if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+        return ncols <= 16 ? 64 : 128;
+#else
+        return ncols <= 16 ? 256 : 128;
+#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+    }
+
+    static int get_nbatch_combine_host(const int /*cc*/, const int /*ncols*/) {
+        return 128;
+    }
+
+    static constexpr __device__ int get_nbatch_combine_device(int /*ncols*/) {
+        return 128;
+    }
 };

 // ------------------------------------------------------------------------------------------------------------------
@@ -120,7 +289,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(

        const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);

-        auto load = [&] __device__ (const int n) {
+        auto load = [&] __device__ (auto n) {
            const int stride_k = WARP_SIZE >> n;
            const int k0_start = stride_k == WARP_SIZE ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
            const int k0_stop  =                             chunks_per_row - chunks_per_row % (1*stride_k);
@@ -223,7 +392,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
    }
 }

-template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool needs_fixup, bool is_fixup, bool last_iter>
+template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup, bool last_iter>
 static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        const float2 * const __restrict__ Q_f2,
        const half2  * const __restrict__ K_h2,
@@ -261,10 +430,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    constexpr int cols_per_warp   = ntiles * tile_B::I;
    constexpr int cols_per_thread = ntiles == 1 ? 2 : ntiles;
    constexpr int np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int ncols           = ncols1 * ncols2;
+    constexpr int nbatch_K2       = c::get_nbatch_K2_device(ncols);
+    constexpr int nbatch_V2       = c::get_nbatch_V2_device(ncols);

-    constexpr int stride_tile_Q = DKQ/2        + 4;
-    constexpr int stride_tile_K = c::nbatch_K2 + 4;
-    constexpr int stride_tile_V = c::nbatch_V2 + 4;
+    constexpr int stride_tile_Q = DKQ/2     + 4;
+    constexpr int stride_tile_K = nbatch_K2 + 4;
+
+    static_assert(!mla || nbatch_K2 >= nbatch_V2, "bad nbatch_K2, nbatch_V2 for MLA");
+    constexpr int stride_tile_V = mla ? stride_tile_K : nbatch_V2 + 4;

    const int k_VKQ_0 = kb0 * c::nbatch_fa;
    tile_C_KQ KQ_C[c::nbatch_fa/(np*tile_C_KQ::I) * ntiles];
@@ -275,12 +449,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    tile_C_KQ_16  * KQ_C_16  = (tile_C_KQ_16  *) KQ_C;

    if constexpr (nstages > 1) {
-        static_assert(c::nbatch_K2 == DKQ/2, "batching not implemented for multi stage loading");
+        static_assert(!mla, "multi-stage loading not implemented for MLA");
+        static_assert(nbatch_K2 == DKQ/2, "batching not implemented for multi stage loading");
        constexpr bool use_cp_async = true;
        cp_async_wait_all();
        __syncthreads();
        flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
-            (V_h2 + k_VKQ_0*stride_V, tile_V, c::nbatch_V2, stride_V);
+            (V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
    } else {
        constexpr bool use_cp_async = nstages == 1;
        if (ncols2 > 1 || mask_h2) {
@@ -289,8 +464,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    }

 #pragma unroll
-    for (int k0_start = 0; k0_start < DKQ/2; k0_start += c::nbatch_K2) {
-        const int k0_stop = k0_start + c::nbatch_K2 < DKQ/2 ? k0_start + c::nbatch_K2 : DKQ/2;
+    for (int k0_start = 0; k0_start < DKQ/2; k0_start += nbatch_K2) {
+        const int k0_stop = k0_start + nbatch_K2 < DKQ/2 ? k0_start + nbatch_K2 : DKQ/2;
        const int k0_diff = k0_stop - k0_start;

        if (nstages <= 1) {
@@ -537,16 +712,21 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    (mask_h2 + (k_VKQ_0 + c::nbatch_fa)/2, tile_mask, stride_mask);
            }
            flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
-                (K_h2 + (k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, c::nbatch_K2, stride_K);
+                (K_h2 + (k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, nbatch_K2, stride_K);
        }
    }

-#pragma unroll
-    for (int i0_start = 0; i0_start < DV; i0_start += 2*c::nbatch_V2) {
-        const int i0_stop = i0_start + 2*c::nbatch_V2 < DV ? i0_start + 2*c::nbatch_V2 : DV;
-        const int i0_diff = i0_stop - i0_start;

-        if (nstages <= 1) {
+    // For MLA K and V have the same data.
+    // Therefore, iterate over V in reverse and re-use the data if possible.
+    static_assert(!mla || nstages <= 1, "combination of MLA and multi-stage loading not implemented");
+    constexpr int reusable_cutoff = mla ? (DKQ - 1) - (DKQ - 1) % (2*nbatch_K2) - (DKQ - DV) : DV;
+#pragma unroll
+    for (int i0_stop = DV; i0_stop > 0; i0_stop -= 2*nbatch_V2) {
+        const int i0_start = i0_stop - 2*nbatch_V2 > 0 ? i0_stop - 2*nbatch_V2 : 0;
+        const int i0_diff  = i0_stop - i0_start;
+
+        if (nstages <= 1 && i0_start < reusable_cutoff) {
            constexpr bool use_cp_async = nstages == 1;
            flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
                (V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
@@ -555,6 +735,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
            }
            __syncthreads();
        }
+        const half2 * tile_V_i = i0_start < reusable_cutoff ? tile_V : tile_V + (i0_start - reusable_cutoff)/2;

        // Calculate VKQ tile:
 #pragma unroll
@@ -565,7 +746,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                const int k0 = k00 + (threadIdx.y % np)*tile_A::J;

                tile_A A;
-                load_ldmatrix_trans(A, tile_V + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
+                load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
                if (ntiles == 1) {
                    mma(VKQ_C[i_VKQ_0/tile_C_VKQ::I], A, B[k00/(np*tile_A::J)]);
                } else {
@@ -591,12 +772,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
-    GGML_UNUSED(kb0);
+    GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }

-template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool needs_fixup, bool is_fixup>
+template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup>
 static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const float2 * const __restrict__ Q_f2,
        const half2  * const __restrict__ K_h2,
@@ -632,13 +813,16 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    constexpr int cols_per_warp   = ntiles * tile_B::I;
    constexpr int cols_per_thread = ntiles == 1 ? 2 : ntiles;
    constexpr int np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int nbatch_K2       = c::get_nbatch_K2_device(ncols);
+    constexpr int nbatch_V2       = c::get_nbatch_V2_device(ncols);

    static_assert(nwarps * (cols_per_warp/ncols2) % ncols1 == 0, "bad nwarps");

-    constexpr int stride_tile_Q = DKQ/2        + 4;
-    constexpr int stride_tile_K = c::nbatch_K2 + 4;
-    constexpr int stride_tile_V = c::nbatch_V2 + 4;
+    constexpr int stride_tile_Q = DKQ/2     + 4;
+    constexpr int stride_tile_K = nbatch_K2 + 4;

+    static_assert(!mla || nbatch_K2 >= nbatch_V2, "bad nbatch_K2, nbatch_V2 for MLA");
+    constexpr int stride_tile_V = mla ? stride_tile_K : nbatch_V2 + 4;
    constexpr int stride_tile_KV_max = stride_tile_K > stride_tile_V ? stride_tile_K : stride_tile_V;

    extern __shared__ half2 tile_Q[];
@@ -726,26 +910,26 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(

    // Preload mask and K data for first iteration when using cp_async with multiple stages:
    if constexpr (nstages > 1) {
-        static_assert(c::nbatch_K2 == DKQ/2, "batching not implemented for multi-stage pipeline");
+        static_assert(nbatch_K2 == DKQ/2, "batching not implemented for multi-stage pipeline");
        constexpr bool use_cp_async = true;
        if (ncols2 > 1 || mask_h2) {
            flash_attn_ext_f16_load_mask<ncols1, nwarps, c::nbatch_fa, use_cp_async>
                (mask_h2 + kb0_start*c::nbatch_fa/2, tile_mask, stride_mask);
        }
        flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
-            (K_h2 + kb0_start*c::nbatch_fa*stride_K, tile_K, c::nbatch_K2, stride_K);
+            (K_h2 + kb0_start*c::nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K);
    }

    // Iterate over ne11 == previous tokens:
    for (int kb0 = kb0_start; kb0 < kb0_stop-1; ++kb0) {
        constexpr bool last_iter = false;
-        flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, needs_fixup, is_fixup, last_iter>
+        flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
            (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
             ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
    }
    { // kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
        constexpr bool last_iter = true;
-        flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, needs_fixup, is_fixup, last_iter>
+        flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
            (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
             ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1);
    }
@@ -774,7 +958,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    // It's also faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM.
    // So also write VKQ accumulators to shared memory in column-major format if np == 1.

-    constexpr int nbatch_combine = c::Q_in_reg ? DV/2 : DV/4;
+    constexpr int nbatch_combine = c::get_nbatch_combine_device(ncols);
    constexpr int tile_stride    = nbatch_combine + 4;
    static_assert((DV/2) % nbatch_combine == 0, "bad nbatch_combine");

@@ -895,6 +1079,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
            float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
            dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
        }
+    } else if (np > 1) {
+        // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.
+        // Therefore, all other warps also need to execute a __syncthreads().
+        // Otherwise the points at which warps synchronize with each other would become misaligned.
+        __syncthreads();
    }

 #pragma unroll
@@ -1007,7 +1196,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
 #endif // NEW_MMA_AVAILABLE
 }

-template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap>
+template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla>
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
@@ -1052,6 +1241,14 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
+#if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+    if (ncols1*ncols2 > 32) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+
+    static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");

    typedef fattn_mma_f16_config<DKQ, DV> c;

@@ -1062,9 +1259,10 @@ static __global__ void flash_attn_ext_f16(
    const int stride_Q1   = nb01 / sizeof(float2);
    const int stride_Q2   = nb02 / sizeof(float2);
    const int stride_K    = nb11 / sizeof(half2);
-    const int stride_V    = nb21 / sizeof(half2);
    const int stride_mask = nb31 / sizeof(half2);

+    const int stride_V = mla ? stride_K : nb21 / sizeof(half2);
+
    const int iter_k = ne11 / FATTN_KQ_STRIDE;
    const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;

@@ -1087,10 +1285,11 @@ static __global__ void flash_attn_ext_f16(

        const float2 * Q_f2    = (const float2 *) (Q + nb02* channel*ncols2);
        const half2  * K_h2    = (const half2  *) (K + nb12*(channel*ncols2 / gqa_ratio));
-        const half2  * V_h2    = (const half2  *) (V + nb22*(channel*ncols2 / gqa_ratio));
        const half2  * mask_h2 = ncols2 > 1 || mask ? (const half2  *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
        float2       * dstk    = ((float2 *) dst) + channel*(ncols2 * DV/2);

+        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
+
        const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f;

        const int kb0_start_kernel = kb0_start * kb_niter;
@@ -1099,12 +1298,12 @@ static __global__ void flash_attn_ext_f16(
        constexpr bool is_fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
        if (kb0_start == 0) {
            constexpr bool needs_fixup = false; // CUDA block is working on an entire tile.
-            flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, needs_fixup, is_fixup>
+            flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
                (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
                 ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
        } else {
            constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile.
-            flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, needs_fixup, is_fixup>
+            flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
                (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
                 ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
        }
@@ -1125,10 +1324,11 @@ static __global__ void flash_attn_ext_f16(

    const float2 * Q_f2    = (const float2 *) (Q + nb02* channel*ncols2);
    const half2  * K_h2    = (const half2  *) (K + nb12*(channel*ncols2 / gqa_ratio));
-    const half2  * V_h2    = (const half2  *) (V + nb22*(channel*ncols2 / gqa_ratio)); // K and V have same shape
    const half2  * mask_h2 = ncols2 > 1 || mask ? (const half2  *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
    float2       * dstk    = ((float2 *) dst) + channel*(ncols2 * DV/2);

+    const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
+
    const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f;

    const int kb0_start_kernel = kb0_start * kb_niter;
@@ -1136,7 +1336,7 @@ static __global__ void flash_attn_ext_f16(

    constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
    constexpr bool needs_fixup = false;
-    flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, needs_fixup, is_fixup>
+    flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
        (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
         ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
@@ -1162,10 +1362,6 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml

    typedef fattn_mma_f16_config<DKQ, DV> c;

-    constexpr int nbatch_K2      = c::nbatch_K2      < 1 ? DKQ/2 : c::nbatch_K2;
-    constexpr int nbatch_V2      = c::nbatch_V2      < 1 ? DV /2 : c::nbatch_V2;
-    constexpr int nbatch_combine = c::nbatch_combine < 1 ? DV /2 : c::nbatch_combine;
-
    const int nstages = cp_async_available(cc) ? c::nstages_target : 0;

    constexpr int ncols         = ncols1 * ncols2;
@@ -1175,15 +1371,21 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    constexpr int nwarps_max_y  = c::nbatch_fa / tile_A::I;
    constexpr int nwarps        = nwarps_max_x*nwarps_max_y <= c::nwarps_max ? nwarps_max_x*nwarps_max_y : c::nwarps_max;

+    constexpr bool mla = DKQ == 576;
+
+    const int nbatch_K2      = c::get_nbatch_K2_host     (cc, ncols);
+    const int nbatch_V2      = c::get_nbatch_K2_host     (cc, ncols);
+    const int nbatch_combine = c::get_nbatch_combine_host(cc, ncols);
+
    static_assert(DKQ   % tile_B::J     == 0, "bad DKQ");
    static_assert(DV    % tile_A::J     == 0, "bad DV");
    static_assert(ncols % cols_per_warp == 0, "bad ncols");

-    const size_t nbytes_shared_KV_1stage = c::nbatch_fa         * std::max(c::nbatch_K2 + 4,  c::nbatch_V2 + 4) * sizeof(half2);
-    const size_t nbytes_shared_KV_2stage = c::nbatch_fa         *         (c::nbatch_K2 + 4 + c::nbatch_V2 + 4) * sizeof(half2);
-    const size_t nbytes_shared_Q         = ncols                * (DKQ/2 + 4)                                   * sizeof(half2);
-    const size_t nbytes_shared_mask      = ncols1               * (c::nbatch_fa/2 + 4)                          * sizeof(half2);
-    const size_t nbytes_shared_combine   = nwarps*cols_per_warp * (nbatch_combine + 4)                          * sizeof(half2);
+    const size_t nbytes_shared_KV_1stage = c::nbatch_fa         * std::max(nbatch_K2 + 4,  nbatch_V2 + 4) * sizeof(half2);
+    const size_t nbytes_shared_KV_2stage = c::nbatch_fa         *         (nbatch_K2 + 4 + nbatch_V2 + 4) * sizeof(half2);
+    const size_t nbytes_shared_Q         = ncols                * (DKQ/2 + 4)                             * sizeof(half2);
+    const size_t nbytes_shared_mask      = ncols1               * (c::nbatch_fa/2 + 4)                    * sizeof(half2);
+    const size_t nbytes_shared_combine   = nwarps*cols_per_warp * (nbatch_combine + 4)                    * sizeof(half2);

    const size_t nbytes_shared_KV = nstages <= 1 ? nbytes_shared_KV_1stage : nbytes_shared_KV_2stage;

@@ -1197,7 +1399,7 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    fattn_kernel_t fattn_kernel;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap>;
+        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;

 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
@@ -1208,7 +1410,7 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
    } else {
        constexpr bool use_logit_softcap = true;
-        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap>;
+        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;

 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"

 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#ifndef GGML_USE_HIP
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // GGML_USE_HIP
 static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@@ -48,6 +48,12 @@ static __global__ void flash_attn_vec_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    if (ncols > 1) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -91,6 +97,13 @@ static __global__ void flash_attn_vec_ext_f16(
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
+
+    __shared__ half maskh_shared[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        maskh_shared[j*D + tid] = 0.0f;
+    }
+
    __syncthreads();

    // Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -175,6 +188,36 @@ static __global__ void flash_attn_vec_ext_f16(
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

+        if (mask) {
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
+            }
+
+            __syncthreads();
+
+            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
+            // In such cases, skip the KV slice.
+            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
+            bool skip = true;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+                for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
+                    skip = skip && isinf(tmp.x) && isinf(tmp.y);
+                }
+            }
+            if (__all_sync(0xFFFFFFFF, skip)) {
+                __syncthreads();
+                continue;
+            }
+#endif // GGML_USE_HIP
+        }
+
        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
@@ -202,7 +245,7 @@ static __global__ void flash_attn_vec_ext_f16(
                    sum = logit_softcap*tanhf(sum);
                }

-                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+                sum += maskh_shared[j*D + i_KQ];

                if (ncols == 1) {
                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
@@ -335,7 +378,9 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (Q->ne[1] == 1) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+
+    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"

 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#ifndef GGML_USE_HIP
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // GGML_USE_HIP
 static __global__ void flash_attn_vec_ext_f32(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@@ -60,6 +60,12 @@ static __global__ void flash_attn_vec_ext_f32(
        NO_DEVICE_CODE;
        return;
    }
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    if (ncols > 1) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -104,6 +110,13 @@ static __global__ void flash_attn_vec_ext_f32(
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
+
+    __shared__ float maskf_shared[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        maskf_shared[j*D + tid] = 0.0f;
+    }
+
    __syncthreads();

    // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -181,6 +194,35 @@ static __global__ void flash_attn_vec_ext_f32(
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

+        if (mask) {
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
+            }
+
+            __syncthreads();
+
+            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
+            // In such cases, skip the KV slice.
+            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
+            bool skip = true;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+                for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    skip = skip && isinf(maskf_shared[j*D + i]);
+                }
+            }
+            if (__all_sync(0xFFFFFFFF, skip)) {
+                __syncthreads();
+                continue;
+            }
+#endif // GGML_USE_HIP
+        }
+
        float kqmax_new_arr[ncols];
 #pragma unroll
        for (int j = 0; j < ncols; ++j) {
@@ -204,7 +246,7 @@ static __global__ void flash_attn_vec_ext_f32(
                    sum = logit_softcap*tanhf(sum);
                }

-                sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
+                sum += maskf_shared[j*D + i_KQ];

                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);

@@ -326,7 +368,9 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (Q->ne[1] == 1) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+
+    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -10,6 +10,7 @@

 template <int DKQ, int DV, int ncols2>
 static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const ggml_tensor * Q = dst->src[0];

    if constexpr (ncols2 <= 8) {
@@ -24,7 +25,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
        return;
    }

-    if (Q->ne[1] <= 32/ncols2) {
+    if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || Q->ne[1] <= 32/ncols2) {
        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
        return;
    }
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

+    // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+    // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
+    // Therefore, in such cases use cuBLAS.
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
    bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q     = ggml_is_quantized(src0->type)
+    bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

    bool any_gpus_with_slow_fp16   = false;
@@ -2186,6 +2192,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                case GGML_UNARY_OP_SILU:
                    ggml_cuda_op_silu(ctx, dst);
                    break;
+                case GGML_UNARY_OP_GELU_ERF:
+                    ggml_cuda_op_gelu_erf(ctx, dst);
+                    break;
                case GGML_UNARY_OP_GELU_QUICK:
                    ggml_cuda_op_gelu_quick(ctx, dst);
                    break;
@@ -2971,6 +2980,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_HARDSWISH:
+                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_EXP:
@@ -3216,7 +3226,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 #endif // FLASH_ATTN_AVAILABLE
            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
-                if (!new_mma_available(cc) || cc < GGML_CUDA_CC_AMPERE) {
+                if (!new_mma_available(cc)) {
                    return false;
                }
                const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
        const size_t size_data  = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }
@@ -122,6 +122,7 @@ void ggml_cuda_mul_mat_q(
            const int64_t s13 = src1->nb[3] / ts_src1;
            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
+            CUDA_CHECK(cudaGetLastError());
        }

        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
@@ -205,6 +206,7 @@ void ggml_cuda_mul_mat_q(
        const int64_t s13 = src1->nb[2] / ts_src1;
        quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type,
            ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+        CUDA_CHECK(cudaGetLastError());
    }

    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
        const size_t size_data  = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -56,13 +56,13 @@ static __global__ void quantize_mmq_q8_1(
    constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
    constexpr int vals_per_sum   = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;

-    const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
+    const int64_t i0 = ((int64_t)blockDim.x*blockIdx.y + threadIdx.x)*4;

    if (i0 >= ne0) {
        return;
    }

-    const int64_t i1 = blockIdx.y;
+    const int64_t i1 = blockIdx.x;
    const int64_t i2 = blockIdx.z % ne2;
    const int64_t i3 = blockIdx.z / ne2;

@@ -75,8 +75,8 @@ static __global__ void quantize_mmq_q8_1(

    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;

-    const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
-    const int64_t ib  = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y;                    // block index in channel
+    const int64_t ib0 = blockIdx.z*((int64_t)gridDim.x*gridDim.y*blockDim.x/QK8_1); // first block of channel
+    const int64_t ib  = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.x;                    // block index in channel
    const int64_t iqs = i0 % (4*QK8_1);                                             // quant index in block

    // Load 4 floats per thread and calculate max. abs. value between them:
@@ -166,8 +166,9 @@ void quantize_mmq_q8_1_cuda(
    GGML_ASSERT(ne00 % 4 == 0);
    GGML_ASSERT(ne0 % (4*QK8_1) == 0);

-    const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
-    const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
+    // ne1 tends to assume the highest values, therefore use it as the "x" dimension of the CUDA grid:
+    const int64_t block_num_y = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
+    const dim3 num_blocks(ne1, block_num_y, ne2*ne3);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
    switch (mmq_get_q8_1_ds_layout(type_src0)) {
        case MMQ_Q8_1_DS_LAYOUT_D4:
--- a/ggml/src/ggml-cuda/sum.cu
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguously_allocated(src0));

    const float * src0_d = (const float *) src0->data;
    float * dst_d = (float *) dst->data;
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -23,6 +23,12 @@ static __device__ __forceinline__ float op_gelu(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }

+static __device__ __forceinline__ float op_gelu_erf(float x) {
+    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+
+    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
+}
+
 static __device__ __forceinline__ float op_gelu_quick(float x) {
    const float GELU_QUICK_COEF = -1.702f;

@@ -134,6 +140,10 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu>(ctx, dst);
 }

+void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_gelu_erf>(ctx, dst);
+}
+
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
 }
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -30,6 +30,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -207,6 +207,10 @@ typedef struct {
    float    attn_factor;
    float    beta_fast;
    float    beta_slow;
+    int32_t  sect_0;
+    int32_t  sect_1;
+    int32_t  sect_2;
+    int32_t  sect_3;
 } ggml_metal_kargs_rope;

 typedef struct {
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -149,6 +149,8 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_SIGMOID,
    GGML_METAL_KERNEL_TYPE_GELU,
    GGML_METAL_KERNEL_TYPE_GELU_4,
+    GGML_METAL_KERNEL_TYPE_GELU_ERF,
+    GGML_METAL_KERNEL_TYPE_GELU_ERF_4,
    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
    GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
    GGML_METAL_KERNEL_TYPE_SILU,
@@ -332,6 +334,10 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
    GGML_METAL_KERNEL_TYPE_IM2COL_F16,
@@ -411,6 +417,13 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
@@ -1092,6 +1105,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID,                         sigmoid,                         true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                            gelu,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4,                          gelu_4,                          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF,                        gelu_erf,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF_4,                      gelu_erf_4,                      true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                      gelu_quick,                      true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,                    gelu_quick_4,                    true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                            silu,                            true);
@@ -1275,6 +1290,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,            mul_mm_id_iq4_xs_f16,            has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                   rope_norm_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                   rope_norm_f16,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                  rope_multi_f32,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,                  rope_multi_f16,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,                 rope_vision_f32,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,                 rope_vision_f16,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                   rope_neox_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                   rope_neox_f16,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                      im2col_f16,                      true);
@@ -1354,6 +1373,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,        flash_attn_ext_q8_0_h256,        has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,      flash_attn_ext_vec_f16_h64,      has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,     flash_attn_ext_vec_bf16_h64,     has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,     flash_attn_ext_vec_q4_0_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,     flash_attn_ext_vec_q4_1_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,     flash_attn_ext_vec_q5_0_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,     flash_attn_ext_vec_q5_1_h64,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,     flash_attn_ext_vec_q8_0_h64,     has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,      flash_attn_ext_vec_f16_h96,      has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,     flash_attn_ext_vec_bf16_h96,     has_simdgroup_reduction && use_bfloat);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,     flash_attn_ext_vec_q4_0_h96,     has_simdgroup_reduction);
@@ -1591,6 +1617,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_ELU:
@@ -1637,16 +1664,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-                if (mode & GGML_ROPE_TYPE_MROPE) {
-                    return false;
-                }
-                if (mode & GGML_ROPE_TYPE_VISION) {
-                    return false;
-                }
-                return true;
-            }
+            return true;
        case GGML_OP_IM2COL:
            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_1D:
@@ -2238,6 +2256,25 @@ static bool ggml_metal_encode_node(

                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                } break;
+                case GGML_UNARY_OP_GELU_ERF:
+                {
+                    int64_t n = ggml_nelements(dst);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    if (n % 4 == 0) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF_4].pipeline;
+                        n /= 4;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF].pipeline;
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
                case GGML_UNARY_OP_GELU_QUICK:
                {
                    int64_t n = ggml_nelements(dst);
@@ -3826,6 +3863,7 @@ static bool ggml_metal_encode_node(
            } break;
        case GGML_OP_ROPE:
            {
+
                // make sure we have one or more position id(ne10) per token(ne02)
                GGML_ASSERT(ne10 % ne02 == 0);
                GGML_ASSERT(ne10 >= ne02);
@@ -3852,20 +3890,42 @@ static bool ggml_metal_encode_node(
                memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
                memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));

-                const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+                const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
+                const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;
+                const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+                // mrope
+                const int sect_0 = ((const int32_t *) dst->op_params)[11];
+                const int sect_1 = ((const int32_t *) dst->op_params)[12];
+                const int sect_2 = ((const int32_t *) dst->op_params)[13];
+                const int sect_3 = ((const int32_t *) dst->op_params)[14];

                id<MTLComputePipelineState> pipeline = nil;

-                if (!is_neox) {
+                if (is_neox) {
                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                } else if (is_mrope && !is_vision) {
+                    GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                } else if (is_vision) {
+                    GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
                        default: GGML_ABORT("fatal error");
                    };
                } else {
                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
                        default: GGML_ABORT("fatal error");
                    };
                }
@@ -3896,6 +3956,10 @@ static bool ggml_metal_encode_node(
                    /*.attn_factor =*/ attn_factor,
                    /*.beta_fast   =*/ beta_fast,
                    /*.beta_slow   =*/ beta_slow,
+                    /* sect_0      =*/ sect_0,
+                    /* sect_1      =*/ sect_1,
+                    /* sect_2      =*/ sect_2,
+                    /* sect_3      =*/ sect_3,
                };

                [encoder setComputePipelineState:pipeline];
@@ -4332,7 +4396,7 @@ static bool ggml_metal_encode_node(
                // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
                //       for now avoiding mainly to keep the number of templates/kernels a bit lower
                //       these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
-                if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
+                if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
                    switch (src1->type) {
                        case GGML_TYPE_F16:
                            {
@@ -4513,6 +4577,24 @@ static bool ggml_metal_encode_node(
                    use_vec_kernel = true;

                    switch (ne00) {
+                        case 64:
+                            {
+                                switch (src1->type) {
+                                    case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64].pipeline; break;
+                                    case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64].pipeline; break;
+                                    case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64].pipeline; break;
+                                    case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64].pipeline; break;
+                                    case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64].pipeline; break;
+                                    case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64].pipeline; break;
+                                    case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64].pipeline; break;
+                                    default:
+                                        {
+                                            GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                            GGML_LOG_ERROR("add template specialization for this type\n");
+                                            GGML_ABORT("add template specialization for this type");
+                                        }
+                                }
+                            } break;
                        case 96:
                            {
                                switch (src1->type) {
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -856,6 +856,7 @@ kernel void kernel_tanh(
 constant float GELU_COEF_A     = 0.044715f;
 constant float GELU_QUICK_COEF = -1.702f;
 constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+constant float SQRT_2_INV      = 0.70710678118654752440084436210484f;

 kernel void kernel_gelu(
    device const float * src0,
@@ -897,6 +898,42 @@ kernel void kernel_gelu_quick_4(
    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
 }

+// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
+// ref: https://www.johndcook.com/blog/python_erf/
+constant float p_erf  = 0.3275911f;
+constant float a1_erf = 0.254829592f;
+constant float a2_erf = -0.284496736f;
+constant float a3_erf = 1.421413741f;
+constant float a4_erf = -1.453152027f;
+constant float a5_erf = 1.061405429f;
+
+template<typename T>
+T erf_approx(T x) {
+    T sign_x = sign(x);
+    x = fabs(x);
+    T t = 1.0f / (1.0f + p_erf * x);
+    T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
+    return sign_x * y;
+}
+
+kernel void kernel_gelu_erf(
+    device const float * src0,
+    device       float * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+
+    dst[tpig] = 0.5f*x*(1.0f+erf_approx<float>(x*SQRT_2_INV));
+}
+
+kernel void kernel_gelu_erf_4(
+    device const float4 * src0,
+    device       float4 * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+
+    dst[tpig] = 0.5f*x*(1.0f+erf_approx<float4>(x*SQRT_2_INV));
+}
+
 kernel void kernel_silu(
        device const float * src0,
        device       float * dst,
@@ -2713,8 +2750,148 @@ kernel void kernel_rope_neox(
    }
 }

+template<typename T>
+kernel void kernel_rope_multi(
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+    device const int32_t * pos = (device const int32_t *) src1;
+
+    const float inv_ndims = -1.f/args.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < args.n_dims) {
+            const int ic = i0/2;
+
+            // mrope theta calculations
+            // note: the rest is the same as kernel_rope_neox
+            const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3;
+            const int sec_w01   = args.sect_0 + args.sect_1;               // end of section 1
+            const int sec_w012  = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
+            const int sector    = ic % sect_dims;
+
+            float theta_base;
+            if (sector < args.sect_0) {
+                theta_base = (float) pos[i2];
+            } else if (sector < sec_w01) {
+                theta_base = (float) pos[i2 + args.ne02];
+            } else if (sector < sec_w012) {
+                theta_base = (float) pos[i2 + args.ne02 * 2];
+            } else {
+                theta_base = (float) pos[i2 + args.ne02 * 3];
+            }
+            // end of mrope
+
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
+
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[args.n_dims/2];
+
+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else {
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+template<typename T>
+kernel void kernel_rope_vision(
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+    device const int32_t * pos = (device const int32_t *) src1;
+
+    const float inv_ndims = -1.f/args.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < 2*args.n_dims) { // different from kernel_rope_multi
+            const int ic = i0/2;
+
+            // mrope theta calculations (only support 2 dimensions)
+            const int sect_dims = args.sect_0 + args.sect_1;
+            const int sector    = ic % sect_dims;
+
+            float p;
+            float theta_base;
+            if (sector < args.sect_1) {
+                p = (float) sector;
+                theta_base = (float) pos[i2];
+            } else {
+                p = (float) sector - args.sect_0;
+                theta_base = (float) pos[i2 + args.ne02];
+            }
+
+            const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
+            // end of mrope
+
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[args.n_dims]; // different from kernel_rope_multi
+
+            dst_data[0]           = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi
+        } else {
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
 typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
 typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
+typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
+typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;

 template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
 template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
@@ -2722,6 +2899,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
 template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
 template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;

+template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
+template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
+
+template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
+template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
+
 typedef void (im2col_t)(
        device const float * x,
        device        char * dst,
@@ -3109,7 +3292,7 @@ template<
    typename kd4x4_t, // key type in device memory
    short nl_k,
    void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
-    typename vd4x4_t, // key type in device memory
+    typename vd4x4_t, // value type in device memory
    short nl_v,
    void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
    short DK,        // K head size
@@ -3630,7 +3813,7 @@ template<
    typename kd4_t, // key type in device memory
    short nl_k,
    void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
-    typename vd4_t, // key type in device memory
+    typename vd4_t, // value type in device memory
    short nl_v,
    void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
    short DK,       // K head size
@@ -3741,6 +3924,11 @@ kernel void kernel_flash_attn_ext_vec(
                sm[tiisg] = pm[ic + tiisg];
            }

+            // skip -INF blocks
+            if (simd_max(sm[tiisg]) == -INFINITY) {
+                continue;
+            }
+
            // Q*K^T
            {
                // each simdgroup processes 1 query and NE (NW/NL) head elements
@@ -3973,6 +4161,16 @@ kernel void kernel_flash_attn_ext_vec(

 typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;

+template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4,             1, dequantize_f16_t4,  half4,       1, dequantize_f16_t4,  64, 64, 8>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4,           1, dequantize_bf16_t4, bfloat4,     1, dequantize_bf16_t4, 64, 64, 8>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0,        8, dequantize_q4_0_t4, block_q4_0,  8, dequantize_q4_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1,        8, dequantize_q4_1_t4, block_q4_1,  8, dequantize_q4_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0,        8, dequantize_q5_0_t4, block_q5_0,  8, dequantize_q5_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1,        8, dequantize_q5_1_t4, block_q5_1,  8, dequantize_q5_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0,        8, dequantize_q8_0_t4, block_q8_0,  8, dequantize_q8_0_t4, 64, 64, 8>;
+
 template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4,             1, dequantize_f16_t4,  half4,       1, dequantize_f16_t4,  96, 96, 4>;
 #if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4,           1, dequantize_bf16_t4, bfloat4,     1, dequantize_bf16_t4, 96, 96, 4>;
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)

    file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
    list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
+    list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")

    file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
    list(APPEND GGML_SOURCES_MUSA ${SRCS})
    file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    file(GLOB   SRCS "../ggml-musa/*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})

    if (GGML_CUDA_FA_ALL_QUANTS)
        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
                            )

    # TODO: do not use CUDA definitions for MUSA
-    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+    if (NOT GGML_BACKEND_DL)
+        target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+    endif()

    add_compile_definitions(GGML_USE_MUSA)
    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
@@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
    endif()

    if (GGML_STATIC)
+        # TODO: mudnn has not provided static libraries yet
        target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
    else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
    endif()

    if (GGML_CUDA_NO_VMM)
--- a/ggml/src/ggml-musa/mudnn.cu
+++ b/ggml/src/ggml-musa/mudnn.cu
@@ -0,0 +1,112 @@
+#include <mutex>
+#include <mudnn.h>
+
+#include "mudnn.cuh"
+
+namespace mudnn = musa::dnn;
+
+// Returns a human-readable error string for mudnn::Status
+const char* mudnnGetErrorString(mudnn::Status err) {
+    switch (err) {
+        case mudnn::Status::SUCCESS:
+            return "Success";
+        case mudnn::Status::INVALID_PARAMETER:
+            return "Invalid parameter";
+        case mudnn::Status::NOT_INITIALIZED:
+            return "Not initialized";
+        case mudnn::Status::ALLOC_FAILED:
+            return "Allocation failed";
+        case mudnn::Status::NOT_SUPPORTED:
+            return "Not supported";
+        case mudnn::Status::INTERNAL_ERROR:
+            return "Internal error";
+        case mudnn::Status::ARCH_MISMATCH:
+            return "Architecture mismatch";
+        case mudnn::Status::EXECUTION_FAILED:
+            return "Execution failed";
+        default:
+            return "Unknown mudnn status";
+    }
+}
+
+// Error checking macro for MUDNN calls
+#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)
+
+namespace {
+    // Thread-safe cache for mudnn::Handle objects per device
+    std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache;
+    std::mutex handle_cache_mutex;
+
+    mudnn::Handle* get_cached_handle(int device_id) {
+        std::lock_guard<std::mutex> lock(handle_cache_mutex);
+        auto it = handle_cache.find(device_id);
+        if (it != handle_cache.end()) {
+            return it->second.get();
+        }
+        auto handle = std::make_unique<mudnn::Handle>(device_id);
+        mudnn::Handle* handle_ptr = handle.get();
+        handle_cache[device_id] = std::move(handle);
+        return handle_ptr;
+    }
+}
+
+// Extracts dimensions and strides from a ggml_tensor
+int get_ggml_dims_and_strides(const ggml_tensor* tensor,
+                              std::vector<int64_t>& dims,
+                              std::vector<int64_t>& strides) {
+    const int ndims = ggml_n_dims(tensor);
+    const size_t element_size = ggml_element_size(tensor);
+
+    dims.resize(ndims);
+    strides.resize(ndims);
+
+    for (int i = 0; i < ndims; ++i) {
+        dims[i] = tensor->ne[i];
+        strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size);
+    }
+    return ndims;
+}
+
+// Converts ggml_type to mudnn::Tensor::Type
+mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return mudnn::Tensor::Type::FLOAT;
+        case GGML_TYPE_F16:
+            return mudnn::Tensor::Type::HALF;
+
+        // TODO: Add support for other types
+
+        default:
+            MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED);
+    }
+
+    return mudnn::Tensor::Type::FLOAT; // Default fallback
+}
+
+// Asynchronous memory copy using mudnn::Unary::IDENTITY
+musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
+    mudnn::Tensor tensor_dst, tensor_src;
+
+    MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
+    MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));
+
+    std::vector<int64_t> dims, strides;
+    const int ndims = get_ggml_dims_and_strides(src, dims, strides);
+
+    MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data()));
+    MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
+    MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
+    MUDNN_CHECK(tensor_src.SetAddr(src->data));
+
+    mudnn::Unary op;
+    MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
+    MUDNN_CHECK(op.SetAlpha(0.0f));
+    MUDNN_CHECK(op.SetBeta(0.0f));
+
+    mudnn::Handle* handle = get_cached_handle(ctx.device);
+    MUDNN_CHECK(handle->SetStream(ctx.stream()));
+    MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));
+
+    return musaSuccess;
+}
--- a/ggml/src/ggml-musa/mudnn.cuh
+++ b/ggml/src/ggml-musa/mudnn.cuh
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "../include/ggml.h"
+#include "../ggml-cuda/common.cuh"
+
+// Asynchronously copies data from src tensor to dst tensor using the provided context.
+// Returns a musaError_t indicating success or failure.
+musaError_t mudnnMemcpyAsync(
+    ggml_backend_cuda_context &ctx,
+    const ggml_tensor *dst,
+    const ggml_tensor *src
+);
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <memory>
 #include <charconv>
+#include <mutex>

 #undef MIN
 #undef MAX
@@ -74,6 +75,7 @@ struct ggml_cl_version {
    cl_uint minor = 0;
 };

+
 struct ggml_cl_compiler_version {
    ADRENO_CL_COMPILER_TYPE type;
    int major = -1;
@@ -91,6 +93,14 @@ struct ggml_cl_compiler_version {
    }
 };

+static size_t align_to(size_t value, size_t to_alignment) {
+    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+}
+
+
 // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
 static ggml_cl_version parse_cl_version(std::string_view str) {
    size_t major_str_begin = 0;
@@ -221,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
    return { type, major, minor, patch };
 }

+struct ggml_backend_opencl_context;
+
 // backend device context
 struct ggml_backend_opencl_device_context {
    cl_platform_id platform;
    std::string platform_name;

-    cl_device_id device;
-    std::string device_name;
+    cl_device_id   device;
+    std::string    device_name;
+    cl_device_type device_type;
+    std::string    device_version;
+
+    // Initialized by ggml_cl2_init().
+    ggml_backend_opencl_context * backend_ctx = nullptr;
+
+    // Initialized by ggml_backend_opencl_device_get_buffer_type()
+    ggml_backend_buffer_type buffer_type;
+
+    cl_context context = nullptr;
 };

 // backend context
@@ -248,6 +270,8 @@ struct ggml_backend_opencl_context {

    int adreno_wave_size;

+    cl_bool non_uniform_workgroups;
+
    cl_context context;
    cl_command_queue queue;

@@ -344,15 +368,8 @@ struct ggml_backend_opencl_context {
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 };

-static ggml_backend_device                 g_ggml_backend_opencl_device;
-static ggml_backend_opencl_device_context  g_ggml_ctx_dev_main {
-    /*.platform         =*/ nullptr,
-    /*.platform_nane    =*/ "",
-    /*.device           =*/ nullptr,
-    /*.device_name      =*/ "",
-};
-
-static int ggml_backend_opencl_n_devices = 0;
+// All registered devices with a default device in the front.
+static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;

 // Profiling
 #ifdef GGML_OPENCL_PROFILING
@@ -1107,25 +1124,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
    GGML_LOG_CONT("\n");
 }

-static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
-    static bool initialized = false;
-    static ggml_backend_opencl_context *backend_ctx = nullptr;
+// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+// XXX    static bool initialized = false;
+// XXX    static ggml_backend_opencl_context *backend_ctx = nullptr;

-    if (initialized) {
-        return backend_ctx;
-    }
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);

-    ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
-    GGML_ASSERT(dev_ctx);
-    GGML_ASSERT(dev_ctx->platform == nullptr);
-    GGML_ASSERT(dev_ctx->device == nullptr);
-    GGML_ASSERT(backend_ctx == nullptr);
+namespace /* anonymous */ {
+extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
+}

-    initialized = true;
-    backend_ctx = new ggml_backend_opencl_context();
-    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-
-    cl_int err;
+// Look for available and suitable devices.
+static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
+    std::vector<ggml_backend_device> found_devices;

 #ifdef GGML_OPENCL_PROFILING
    GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
@@ -1158,11 +1169,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    struct cl_device devices[NDEV];
    unsigned n_devices = 0;
    struct cl_device * default_device = NULL;
+    unsigned           default_platform_number = 0;

    cl_platform_id platform_ids[NPLAT];
    if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
        GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
-        return backend_ctx;
+        return found_devices;
    }

    for (unsigned i = 0; i < n_platforms; i++) {
@@ -1197,19 +1209,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
        }

        if (default_device == NULL && p->default_device != NULL) {
-            default_device = p->default_device;
+            default_device          = p->default_device;
+            default_platform_number = i;
        }
    }

    if (n_devices == 0) {
        GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
-        return backend_ctx;
+        return found_devices;
    }

-    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
-    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
-    int user_platform_number = -1;
-    int user_device_number = -1;
+    char *      user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+    char *      user_device_string   = getenv("GGML_OPENCL_DEVICE");
+    int         user_platform_number = -1;
+    int         user_device_number   = -1;
+    cl_device * candidate_devices    = nullptr;
+    unsigned    n_candidate_devices  = 0;

    unsigned n;
    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
@@ -1224,12 +1239,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
            GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
            exit(1);
        }
-        default_device = &platform->devices[user_device_number];
+        default_device      = &platform->devices[user_device_number];
+        candidate_devices   = platform->devices;
+        n_candidate_devices = platform->n_devices;
    } else {
-
-        struct cl_device * selected_devices = devices;
-        unsigned n_selected_devices = n_devices;
-
+        // Choose a platform by matching a substring.
        if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
            for (unsigned i = 0; i < n_platforms; i++) {
                struct cl_platform * p = &platforms[i];
@@ -1244,20 +1258,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
                exit(1);
            }
        }
-        if (user_platform_number != -1) {
-            struct cl_platform * p = &platforms[user_platform_number];
-            selected_devices = p->devices;
-            n_selected_devices = p->n_devices;
-            default_device = p->default_device;
-            if (n_selected_devices == 0) {
-                GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
-                exit(1);
-            }
+
+        int                  platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
+        struct cl_platform * p            = &platforms[platform_idx];
+        candidate_devices                 = p->devices;
+        n_candidate_devices               = p->n_devices;
+        default_device                    = p->default_device;
+        if (n_candidate_devices == 0) {
+            GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+            exit(1);
        }

        if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
-            for (unsigned i = 0; i < n_selected_devices; i++) {
-                struct cl_device * d = &selected_devices[i];
+            for (unsigned i = 0; i < n_candidate_devices; i++) {
+                struct cl_device * d = &candidate_devices[i];
                if (strstr(d->name, user_device_string) != NULL) {
                    user_device_number = d->number;
                    break;
@@ -1269,71 +1283,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
            }
        }
        if (user_device_number != -1) {
-            selected_devices = &devices[user_device_number];
-            n_selected_devices = 1;
-            default_device = &selected_devices[0];
+            candidate_devices   = &devices[user_device_number];
+            n_candidate_devices = 1;
+            default_device      = &candidate_devices[0];
        }

-        GGML_ASSERT(n_selected_devices > 0);
+        GGML_ASSERT(n_candidate_devices > 0);

        if (default_device == NULL) {
-            default_device = &selected_devices[0];
+            default_device = &candidate_devices[0];
        }
    }

-    GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
-    GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
-    if (default_device->type != CL_DEVICE_TYPE_GPU) {
-        GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+    GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
+
+    // Put the default device in front.
+    for (unsigned i = 1; i < n_candidate_devices; i++) {
+        if (&candidate_devices[i] == default_device) {
+            std::swap(candidate_devices[0], candidate_devices[i]);
+            default_device = &candidate_devices[0];
+            break;
+        }
    }

-    dev_ctx->platform = default_device->platform->id;
-    dev_ctx->device = default_device->id;
-    backend_ctx->device = default_device->id;
+    GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);

-    if (strstr(default_device->name, "Adreno") ||
-        strstr(default_device->name, "Qualcomm") ||
-        strstr(default_device->version, "Adreno")) {
+    std::vector<cl_device_id> device_ids;
+    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+        device_ids.push_back(dev->id);
+    }
+
+    cl_int                err;
+    cl_context            shared_context;
+    cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
+
+    CL_CHECK(
+        (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
+
+    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+        GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
+
+        auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
+            /*.platform         =*/dev->platform->id,
+            /*.platform_nane    =*/dev->platform->name,
+            /*.device           =*/dev->id,
+            /*.device_name      =*/dev->name,
+            /*.device_type      =*/dev->type,
+            /*.device_version   =*/dev->version,
+            /*.backend_ctx      =*/nullptr,
+            /*.buffer_type      =*/{},
+            /*.context          =*/shared_context,
+        });
+
+        found_devices.push_back(ggml_backend_device{
+            /* .iface   = */ ggml_backend_opencl_device_i,
+            /* .reg     = */ reg,
+            /* .context = */ dev_ctx.get(),
+        });
+
+        if (!ggml_cl2_init(&found_devices.back())) {
+            found_devices.pop_back();
+            GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
+            continue;
+        }
+
+        dev_ctx.release();
+    }
+
+    if (found_devices.size()) {
+        auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
+        GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
+                      dev_ctx->device_version.c_str());
+
+        if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
+            GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
+                          dev_ctx->device_name.c_str());
+        }
+    }
+
+    return found_devices;
+}
+
+// Initialize device if it is supported (returns nullptr if it is not).
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+    GGML_ASSERT(dev);
+    GGML_ASSERT(dev->context);
+
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+    GGML_ASSERT(dev_ctx->platform);
+    GGML_ASSERT(dev_ctx->device);
+
+    if (dev_ctx->backend_ctx) {
+        return dev_ctx->backend_ctx;
+    }
+
+    auto backend_ctx        = std::make_unique<ggml_backend_opencl_context>();
+    backend_ctx->device     = dev_ctx->device;
+    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+
+    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
+        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
+        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
        backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
        // Usually device version contains the detailed device name
-        backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
+        backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
-            backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+            backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
        }

        // Use wave size of 64 for all Adreno GPUs.
        backend_ctx->adreno_wave_size = 64;
-    } else if (strstr(default_device->name, "Intel")) {
+    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
        backend_ctx->gpu_family = GPU_FAMILY::INTEL;
    } else {
-        GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
+        GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
        backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
-        return backend_ctx;
+        return nullptr;
    }

 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
        GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
            "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
-        return backend_ctx;
+        return nullptr;
    }
 #endif

    // Populate backend device name
-    dev_ctx->platform_name = default_device->platform->name;
-    dev_ctx->device_name = default_device->name;
-    backend_ctx->device_name = default_device->name;
+    backend_ctx->device_name = dev_ctx->device_name;

    // A local ref of cl_device_id for convenience
    cl_device_id device = backend_ctx->device;

-    ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+    ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);

    // Check device OpenCL version, OpenCL 2.0 or above is required
    ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
    if (opencl_c_version.major < 2) {
        GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
-        return backend_ctx;
+        return nullptr;
    }

    // Check driver version
@@ -1364,7 +1452,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    // fp16 is required
    if (!backend_ctx->fp16_support) {
        GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
-        return backend_ctx;
+        return nullptr;
    }

    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
@@ -1373,7 +1461,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
        GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
-        return backend_ctx;
+        return nullptr;
    }

    cl_uint base_align_in_bits;
@@ -1397,6 +1485,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
        svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");

+    if (opencl_c_version.major >= 3) {
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+                                 &backend_ctx->non_uniform_workgroups, 0));
+    } else {
+        GGML_ASSERT(opencl_c_version.major == 2);
+        // Non-uniform workgroup sizes is mandatory feature in v2.x.
+        backend_ctx->non_uniform_workgroups = true;
+    }
+
    // Print out configurations
 #ifdef GGML_OPENCL_SOA_Q
    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
@@ -1406,14 +1503,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS

-    cl_context_properties properties[] = {
-        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
-    };
-
-    CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+    cl_int err;

    // A local ref of cl_context for convenience
-    cl_context context = backend_ctx->context;
+    cl_context context = backend_ctx->context = dev_ctx->context;

    //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
    //    (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
@@ -1426,7 +1519,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));

    // Load kernels
-    load_cl_kernels(backend_ctx, opencl_c_version);
+    load_cl_kernels(backend_ctx.get(), opencl_c_version);

 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    // Allocate intermediate buffers and images
@@ -1456,10 +1549,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->B_d_max   = clCreateBuffer(context, 0, max_B_d_bytes,   NULL, &err), err));
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS

-    // For now we support a single devices
-    ggml_backend_opencl_n_devices = 1;
-
-    return backend_ctx;
+    dev_ctx->backend_ctx = backend_ctx.release();
+    return dev_ctx->backend_ctx;
 }

 static void ggml_cl2_free(void) {
@@ -1664,10 +1755,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
    GGML_UNUSED(backend);
 }

+// Syncronizes the 'backend_ctx's device with others so that commands
+// enqueued to it won't start until commands in the other devices have
+// completed.
+static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
+    if (g_ggml_backend_opencl_devices.size() < 2)
+      return; // No other devices to synchronize with.
+
+    std::vector<cl_event> events;
+    events.reserve(g_ggml_backend_opencl_devices.size());
+
+    for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
+        auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
+        if (backend_ctx != other_backend_ctx) {
+            cl_event ev;
+            CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
+            CL_CHECK(clFlush(other_backend_ctx->queue));
+            events.push_back(ev);
+        }
+    }
+
+    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
+    for (auto ev : events) {
+        CL_CHECK(clReleaseEvent(ev));
+    }
+}
+
+static void sync_with_other_backends(ggml_backend_t backend) {
+    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+    sync_with_other_backends(backend_ctx);
+}
+
 static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

+        // NOTE: this may oversynchronize by synchronizing with
+        //       backends/devices which don't compute 'cgraph's
+        //       dependencies.
+        sync_with_other_backends(backend);
+
        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }
@@ -2058,15 +2185,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
        // The original tensor memory is divided into scales and quants, i.e.,
        // we first store scales, then quants.
        // Create subbuffer for scales.
-        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
        region.size = size_d;
        extra->d = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
+        auto previous_origin = region.origin;

        // Create subbuffer for quants.
-        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
        region.size = size_q;
        extra->q = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
@@ -2271,8 +2399,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
    cl_context context = backend_ctx->context;
    cl_command_queue queue = backend_ctx->queue;

-    // Make sure all previously submitted commands are finished.
-    CL_CHECK(clFinish(queue));
+    // Make sure all previously submitted commands in other devices are finished.
+    sync_with_other_backends(backend_ctx);

 #ifdef GGML_OPENCL_SOA_Q
    // In end-to-end runs, get_tensor is usually used to get back the logits,
@@ -2376,13 +2504,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
 }

 static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
-    // FIXME: not thread safe, device may not be initialized yet
-    static cl_uint alignment = -1;
-    if (alignment == (cl_uint)-1) {
-        ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
-        alignment = backend_ctx->alignment;
-    }
-    return alignment;
+    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+    return backend_ctx->alignment;
 }

 static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
@@ -2409,16 +2532,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
    /* .is_host          = */ NULL,
 };

-ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
-    static ggml_backend_buffer_type buffer_type = {
-        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
-        /* .device  = */ &g_ggml_backend_opencl_device,
-        /* .context = */ nullptr,
-    };
-
-    return &buffer_type;
-}
-
 //
 // backend device
 //
@@ -2476,9 +2589,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
 }

 static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
-    return ggml_backend_opencl_buffer_type();
+    auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);

-    GGML_UNUSED(dev);
+    dev_ctx->buffer_type = ggml_backend_buffer_type{
+        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
+        /* .device  = */ dev,
+        /* .context = */ nullptr,
+    };
+
+    return &dev_ctx->buffer_type;
 }

 static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -2494,12 +2613,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const
 }

 static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
+    // Check 'dev' and 'buffer_type' are not objects belonging to this backend.
+    if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
+        buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
+        return false;
+    }

-    GGML_UNUSED(dev);
+    // Check cl_context is the same. clEnqueue* commands may not use
+    // buffers from another cl_context.
+    ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
+    ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
+    return backend_ctx0->context == backend_ctx1->context;
 }

-static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
+namespace /* anonymous */ {
+struct ggml_backend_device_i ggml_backend_opencl_device_i = {
    /* .get_name             = */ ggml_backend_opencl_device_get_name,
    /* .get_description      = */ ggml_backend_opencl_device_get_description,
    /* .get_memory           = */ ggml_backend_opencl_device_get_memory,
@@ -2516,6 +2644,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
 };
+}

 // Backend registry

@@ -2526,15 +2655,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
 }

 static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
-    return ggml_backend_opencl_n_devices;
+    return g_ggml_backend_opencl_devices.size();

    GGML_UNUSED(reg);
 }

 static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
+    GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));

-    return &g_ggml_backend_opencl_device;
+    return &g_ggml_backend_opencl_devices[index];

    GGML_UNUSED(reg);
    GGML_UNUSED(index);
@@ -2548,27 +2677,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
 };

 ggml_backend_reg_t ggml_backend_opencl_reg(void) {
-    // TODO: make this thread-safe somehow?
+    static std::mutex mutex;
    static ggml_backend_reg reg;
    static bool initialized = false;
+    std::lock_guard<std::mutex> lock(mutex);

-    if (!initialized) {
-        reg = ggml_backend_reg {
-            /* .api_version = */ GGML_BACKEND_API_VERSION,
-            /* .iface   = */ ggml_backend_opencl_reg_i,
-            /* .context = */ NULL,
-        };
-
-        g_ggml_backend_opencl_device = ggml_backend_device {
-            /* .iface   = */ ggml_backend_opencl_device_i,
-            /* .reg     = */ &reg,
-            /* .context = */ &g_ggml_ctx_dev_main,
-        };
-
-        ggml_cl2_init(&g_ggml_backend_opencl_device);
-
-        initialized = true;
+    if (initialized) {
+        return &reg;
    }
+    initialized = true;
+
+    g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(&reg);
+
+    reg = ggml_backend_reg{
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_opencl_reg_i,
+        /* .context     = */ NULL,
+    };

    return &reg;
 }
@@ -2942,14 +3067,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
        size_t global_work_size[] = {(size_t)n, 1, 1};
        size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
    } else {
        unsigned int nth = MIN(64, ne0);
@@ -3077,14 +3207,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
        size_t global_work_size[] = {(size_t)n, 1, 1};
        size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
    } else {
        unsigned int nth = MIN(64, ne0);
@@ -3233,14 +3368,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -3273,14 +3413,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -3320,14 +3465,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -4230,14 +4380,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
 #ifdef GGML_OPENCL_PROFILING
    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
 }

@@ -4418,14 +4573,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
        size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
        size_t local_work_size[] = {64, 1, 1};

+        size_t * local_work_size_ptr = local_work_size;
+        if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+        }
+
 #ifdef GGML_OPENCL_PROFILING
        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
 #else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
 #endif
    }
 }
@@ -4855,8 +5015,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            if (!any_on_device) {
                return false;
            }
-            GGML_ASSERT(ggml_is_contiguous(src0));
-            GGML_ASSERT(ggml_is_contiguous(src1));
            func = ggml_cl_add;
            break;
        case GGML_OP_MUL:
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@@ -28,16 +28,19 @@ struct ggml_opt_dataset {
 };

 struct ggml_opt_context {
-    ggml_backend_sched_t    backend_sched        = nullptr;
-    ggml_cgraph           * allocated_graph      = nullptr;
-    ggml_cgraph           * allocated_graph_copy = nullptr;
-    struct ggml_context   * ctx_static           = nullptr;
-    struct ggml_context   * ctx_static_cpu       = nullptr;
-    struct ggml_context   * ctx_compute          = nullptr;
-    struct ggml_context   * ctx_copy             = nullptr;
-    ggml_backend_buffer_t   buf_static           = nullptr;
-    ggml_backend_buffer_t   buf_static_cpu       = nullptr;
-    std::mt19937            rng;
+    ggml_backend_sched_t       backend_sched        = nullptr;
+    ggml_cgraph              * allocated_graph      = nullptr;
+    ggml_cgraph              * allocated_graph_copy = nullptr;
+    struct ggml_context      * ctx_static           = nullptr;
+    struct ggml_context      * ctx_cpu              = nullptr;
+    struct ggml_context      * ctx_compute          = nullptr;
+    struct ggml_context      * ctx_copy             = nullptr;
+    ggml_backend_buffer_t      buf_static           = nullptr;
+    ggml_backend_buffer_t      buf_cpu              = nullptr;
+    std::mt19937               rng;
+    enum ggml_opt_loss_type    loss_type;
+    enum ggml_opt_build_type   build_type;
+    enum ggml_opt_build_type   build_type_alloc;

    struct ggml_tensor * inputs  = nullptr;
    struct ggml_tensor * outputs = nullptr;
@@ -50,6 +53,11 @@ struct ggml_opt_context {
    struct ggml_cgraph * gf      = nullptr;
    struct ggml_cgraph * gb_grad = nullptr;
    struct ggml_cgraph * gb_opt  = nullptr;
+    bool static_graphs           = false;
+    bool eval_ready              = false;
+    std::vector<struct ggml_tensor *> grad_accs;
+    std::vector<struct ggml_tensor *> grad_m;
+    std::vector<struct ggml_tensor *> grad_v;

    int64_t iter               = 1;
    int32_t opt_period         = 1;
@@ -73,7 +81,13 @@ struct ggml_opt_result {

 // ====== Dataset ======

-ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
+ggml_opt_dataset_t ggml_opt_dataset_init(
+        enum ggml_type type_data,
+        enum ggml_type type_label,
+        int64_t        ne_datapoint,
+        int64_t        ne_label,
+        int64_t        ndata,
+        int64_t        ndata_shard) {
    GGML_ASSERT(ne_datapoint >  0);
    GGML_ASSERT(ne_label     >= 0);
    GGML_ASSERT(ndata        >  0);
@@ -92,11 +106,11 @@ ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label,
        result->ctx = ggml_init(params);
    }

-    result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
+    result->data = ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata);
    result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;

    if (ne_label > 0) {
-        result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata);
+        result->labels = ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata);
        result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
    } else {
        result->labels = nullptr;
@@ -119,6 +133,10 @@ void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
    delete dataset;
 }

+int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset) {
+    return dataset->ndata;
+}
+
 struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
    return dataset->data;
 }
@@ -144,6 +162,8 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
    GGML_ASSERT(   data_batch && ggml_is_contiguous(data_batch));
    GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+    GGML_ASSERT(                   data_batch->type == dataset->data->type);
+    GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type);

    const size_t nb_data_batch = ggml_nbytes(data_batch);
    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
@@ -171,6 +191,31 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
    }
 }

+void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) {
+    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
+
+    const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
+
+    GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
+
+    for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
+        const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
+
+        const char * ptr_data       = (const char *) dataset->data->data + ishard      *dataset->nbs_data;
+        char       * ptr_data_batch = (char       *) data_batch          + ishard_batch*dataset->nbs_data;
+        memcpy(ptr_data_batch, ptr_data, dataset->nbs_data);
+
+        if (!labels_batch) {
+            continue;
+        }
+
+        const char * ptr_labels       = (const char *) dataset->labels->data + ishard      *dataset->nbs_labels;
+        char       * ptr_labels_batch = (char       *) labels_batch          + ishard_batch*dataset->nbs_labels;
+        memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels);
+    }
+}
+
 // ====== Model / Context ======

 struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
@@ -187,17 +232,18 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
    return result;
 }

+struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
+    return *((struct ggml_opt_optimizer_params *) userdata);
+}
+
 struct ggml_opt_params ggml_opt_default_params(
        ggml_backend_sched_t      backend_sched,
-        struct ggml_context     * ctx_compute,
-        struct ggml_tensor      * inputs,
-        struct ggml_tensor      * outputs,
        enum ggml_opt_loss_type   loss_type) {
    return {
        /*backend_sched   =*/ backend_sched,
-        /*ctx_compute     =*/ ctx_compute,
-        /*inputs          =*/ inputs,
-        /*logits          =*/ outputs,
+        /*ctx_compute     =*/ nullptr,
+        /*inputs          =*/ nullptr,
+        /*logits          =*/ nullptr,
        /*loss_type       =*/ loss_type,
        /*build_type      =*/ GGML_OPT_BUILD_TYPE_OPT,
        /*opt_period      =*/ 1,
@@ -266,195 +312,246 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
    return dst;
 }

-static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
-    GGML_ASSERT(graph);
-    if (opt_ctx->allocated_graph == graph) {
-        return;
-    }
+static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
+    GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
+    GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");

-    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+    const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
+        !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);

-    {
-        ggml_init_params params = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_free(opt_ctx->ctx_copy);
-        opt_ctx->ctx_copy = ggml_init(params);
-    }
-
-    opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
-
-    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
-    opt_ctx->allocated_graph = graph;
-}
-
-ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
-    ggml_opt_context_t result = new struct ggml_opt_context;
-    result->backend_sched   = params.backend_sched;
-    result->ctx_compute     = params.ctx_compute;
-    result->inputs          = params.inputs;
-    result->outputs         = params.outputs;
-    result->opt_period      = params.opt_period;
-    result->get_opt_pars    = params.get_opt_pars;
-    result->get_opt_pars_ud = params.get_opt_pars_ud;
-
-    GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
-    GGML_ASSERT(result->opt_period >= 1);
-
-    const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
-        (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
-
-    ggml_set_input(result->inputs);
-    ggml_set_output(result->outputs);
-
-    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
-    ggml_build_forward_expand(result->gf, result->outputs);
+    ggml_set_input(opt_ctx->inputs);
+    ggml_set_output(opt_ctx->outputs);

    int n_param = 0;
-    for (int i = 0; i < result->gf->n_nodes; ++i) {
-        if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
+    for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) {
+        const struct ggml_tensor * node = opt_ctx->gf->nodes[i];
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
            n_param++;
        }
+        GGML_ASSERT(!(node->flags & GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented");
    }

-    {
+    if (!opt_ctx->ctx_static) {
        // The static context is used for:
-        //   - gradients (1 tensor per param if using gradient accumulation)
+        //   - gradients (1 per loss, 1 tensor per param if using gradient accumulation)
        //   - optimizer momenta (2 tensors per param)
-        //   - labels
-        //   - loss + its gradient (up to 5 tensors)
-        //   - pred
-        //   - ncorrect (2 tensors).
-        const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
-        const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead();
+        //   - labels (if using static graphs)
+        //   - loss (if using static graphs, up to 5 tensors)
+        //   - pred (if using static graphs)
+        //   - ncorrect (if using static graphs, 2 tensors).
+        constexpr size_t n_loss = 1;
+        const size_t tensors_per_param = (accumulate ? 1 : 0) +
+            (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+        const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
+        const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
        struct ggml_init_params params = {
            /*.mem_size   =*/ size_meta,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
-        result->ctx_static = ggml_init(params);
+        opt_ctx->ctx_static = ggml_init(params);
    }
+    GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc);
+
    {
-        // The static cpu context is used for:
-        //   - optimizer parameters (1 for the entire context)
+        // The cpu context is allocated statically if using static graphs, dynamically otherwise.
+        // It is used for:
+        //   - optimizer parameters (1 shared for all optimizer invocations)
        const size_t size_meta = 1 * ggml_tensor_overhead();
        struct ggml_init_params params = {
            /*.mem_size   =*/ size_meta,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
-        result->ctx_static_cpu = ggml_init(params);
+        ggml_free(opt_ctx->ctx_cpu);
+        opt_ctx->ctx_cpu = ggml_init(params);
+
+        ggml_backend_buffer_free(opt_ctx->buf_cpu);
+        opt_ctx->buf_cpu = nullptr;
    }

+    struct ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute;

-    switch (params.loss_type) {
+    switch (opt_ctx->loss_type) {
        case GGML_OPT_LOSS_TYPE_MEAN: {
-            result->loss = ggml_sum(result->ctx_static, result->outputs);
-            ggml_set_name(result->loss, "loss_sum");
-            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
-            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
-            ggml_set_name(result->loss, "loss_mean");
-            result->loss_per_datapoint = true;
+            opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
+            ggml_set_name(opt_ctx->loss, "loss_sum");
+            const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
+            opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
+            ggml_set_name(opt_ctx->loss, "loss_mean");
+            opt_ctx->loss_per_datapoint = true;
            break;
        }
        case GGML_OPT_LOSS_TYPE_SUM: {
-            result->loss = ggml_sum(result->ctx_static, result->outputs);
-            ggml_set_name(result->loss, "loss_sum");
-            result->loss_per_datapoint = false;
+            opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
+            ggml_set_name(opt_ctx->loss, "loss_sum");
+            opt_ctx->loss_per_datapoint = false;
            break;
        }
        case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
-            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
-            ggml_set_input(result->labels);
-            ggml_set_name(result->labels, "labels");
-            result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
-            ggml_set_name(result->loss, "loss_cross_entropy");
-            if (result->opt_period > 1) {
-                result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
-                ggml_set_name(result->loss, "loss_cross_entropy_scaled");
+            opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
+            ggml_set_input(opt_ctx->labels);
+            ggml_set_name(opt_ctx->labels, "labels");
+            opt_ctx->loss = ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels);
+            ggml_set_name(opt_ctx->loss, "loss_cross_entropy");
+            if (opt_ctx->opt_period > 1) {
+                opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period);
+                ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled");
            }
-            result->loss_per_datapoint = true;
+            opt_ctx->loss_per_datapoint = true;
            break;
        }
        case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
-            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
-            ggml_set_input(result->labels);
-            ggml_set_name(result->labels, "labels");
-            result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
-            ggml_set_name(result->loss, "loss_error");
-            result->loss = ggml_sqr(result->ctx_static, result->loss);
-            ggml_set_name(result->loss, "loss_squared_error");
-            result->loss = ggml_sum(result->ctx_static, result->loss);
-            ggml_set_name(result->loss, "loss_sum_squared_error");
-            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
-            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
-            ggml_set_name(result->loss, "loss_mean_squared_error");
-            result->loss_per_datapoint = true;
+            opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
+            ggml_set_input(opt_ctx->labels);
+            ggml_set_name(opt_ctx->labels, "labels");
+            opt_ctx->loss = ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels);
+            ggml_set_name(opt_ctx->loss, "loss_error");
+            opt_ctx->loss = ggml_sqr(ctx_results, opt_ctx->loss);
+            ggml_set_name(opt_ctx->loss, "loss_squared_error");
+            opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->loss);
+            ggml_set_name(opt_ctx->loss, "loss_sum_squared_error");
+            const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
+            opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
+            ggml_set_name(opt_ctx->loss, "loss_mean_squared_error");
+            opt_ctx->loss_per_datapoint = true;
            break;
        }
    }
-    ggml_set_output(result->loss);
-    ggml_set_loss(result->loss);
-    ggml_build_forward_expand(result->gf, result->loss);
+    ggml_set_output(opt_ctx->loss);
+    ggml_set_loss(opt_ctx->loss);
+    ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss);

-    result->pred = ggml_argmax(result->ctx_static, result->outputs);
-    ggml_set_name(result->pred, "pred");
-    ggml_set_output(result->pred);
-    ggml_build_forward_expand(result->gf, result->pred);
+    if (opt_ctx->loss_type == GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) {
+        opt_ctx->pred = ggml_argmax(ctx_results, opt_ctx->outputs);
+        ggml_set_name(opt_ctx->pred, "pred");
+        ggml_set_output(opt_ctx->pred);
+        ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred);

-    if (result->labels) {
-        result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels));
-        ggml_set_name(result->ncorrect, "ncorrect");
-        ggml_set_output(result->ncorrect);
-        ggml_build_forward_expand(result->gf, result->ncorrect);
-    } else {
-        result->ncorrect = nullptr;
+        opt_ctx->ncorrect = ggml_count_equal(ctx_results, opt_ctx->pred, ggml_argmax(ctx_results, opt_ctx->labels));
+        ggml_set_name(opt_ctx->ncorrect, "ncorrect");
+        ggml_set_output(opt_ctx->ncorrect);
+        ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect);
    }

-    if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
-        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        return result;
+    if (opt_ctx->buf_static) {
+        if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
+            return;
+        }
+    } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_FORWARD) {
+        opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
+            opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        return;
    }

-    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
-    result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf);
-    ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
+    if (opt_ctx->grad_accs.empty()) {
+        GGML_ASSERT(opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD);

-    if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
-        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        ggml_graph_reset(result->gb_grad);
-        return result;
-    }
+        const int n_nodes = opt_ctx->gf->n_nodes;
+        opt_ctx->grad_accs.resize(n_nodes);
+        for (int i = 0; i < n_nodes; ++i) {
+            ggml_tensor * node = opt_ctx->gf->nodes[i];
+            if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
+                opt_ctx->grad_accs[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
+            } else {
+                opt_ctx->grad_accs[i] = nullptr;
+            }
+        }

-    GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT);
-
-    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
-    result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad);
-
-    result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7);
-    ggml_set_input(result->adamw_params);
-    ggml_set_name(result->adamw_params, "adamw_params");
-
-    for (int i = result->gf->n_nodes-1; i >= 0; --i) {
-        struct ggml_tensor * node = result->gb_opt->nodes[i];
-        struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node);
-
-        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-            struct ggml_tensor * m        = ggml_dup_tensor(result->ctx_static, node);
-            struct ggml_tensor * v        = ggml_dup_tensor(result->ctx_static, node);
-            struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
-            ggml_build_forward_expand(result->gb_opt, opt_step);
+        if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
+            opt_ctx->grad_m.resize(n_nodes);
+            opt_ctx->grad_v.resize(n_nodes);
+            for (int i = 0; i < n_nodes; ++i) {
+                ggml_tensor * node = opt_ctx->gf->nodes[i];
+                if (node->flags & GGML_TENSOR_FLAG_PARAM) {
+                    opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
+                    opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
+                } else {
+                    opt_ctx->grad_m[i] = nullptr;
+                    opt_ctx->grad_v[i] = nullptr;
+                }
+            }
        }
    }

-    result->buf_static = ggml_backend_alloc_ctx_tensors(
-        result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
+    opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
+    ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());

-    result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
+    if (opt_ctx->buf_static) {
+        if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_GRAD) {
+            return;
+        }
+    } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_GRAD) {
+        opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        ggml_graph_reset(opt_ctx->gb_grad);
+    }

-    ggml_graph_reset(result->gb_opt);
+    GGML_ASSERT(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT);
+
+    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
+    opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
+
+    opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
+    ggml_set_input(opt_ctx->adamw_params);
+    ggml_set_name(opt_ctx->adamw_params, "adamw_params");
+
+    for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
+        struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
+
+        if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
+            struct ggml_tensor * m        = opt_ctx->grad_m[i];
+            struct ggml_tensor * v        = opt_ctx->grad_v[i];
+            struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
+
+            ggml_set_name(m,        (std::string("AdamW m for ")    + std::string(node->name)).c_str());
+            ggml_set_name(v,        (std::string("AdamW v for ")    + std::string(node->name)).c_str());
+            ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
+
+            ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
+        }
+    }
+
+    if (!opt_ctx->buf_static) {
+        opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
+            opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        ggml_graph_reset(opt_ctx->gb_opt);
+    }
+
+    opt_ctx->buf_cpu = ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, ggml_backend_cpu_buffer_type());
+}
+
+ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
+    ggml_opt_context_t result = new struct ggml_opt_context;
+    result->backend_sched    = params.backend_sched;
+    result->ctx_compute      = params.ctx_compute;
+    result->loss_type        = params.loss_type;
+    result->build_type       = params.build_type;
+    result->build_type_alloc = params.build_type;
+    result->inputs           = params.inputs;
+    result->outputs          = params.outputs;
+    result->opt_period       = params.opt_period;
+    result->get_opt_pars     = params.get_opt_pars;
+    result->get_opt_pars_ud  = params.get_opt_pars_ud;
+
+    GGML_ASSERT(result->opt_period >= 1);
+
+    result->static_graphs = result->ctx_compute;
+
+    if (!result->static_graphs) {
+        GGML_ASSERT(!result->inputs);
+        GGML_ASSERT(!result->outputs);
+        return result;
+    }
+
+    GGML_ASSERT(result->inputs);
+    GGML_ASSERT(result->outputs);
+
+    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
+    ggml_build_forward_expand(result->gf, result->outputs);
+
+    ggml_opt_build(result);

    return result;
 }
@@ -464,9 +561,9 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
        return;
    }
    ggml_backend_buffer_free(opt_ctx->buf_static);
-    ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
+    ggml_backend_buffer_free(opt_ctx->buf_cpu);
    ggml_free(opt_ctx->ctx_static);
-    ggml_free(opt_ctx->ctx_static_cpu);
+    ggml_free(opt_ctx->ctx_cpu);
    delete opt_ctx;
 }

@@ -479,6 +576,10 @@ void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer) {
    }
 }

+bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->static_graphs;
+}
+
 struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx) {
    return opt_ctx->inputs;
 }
@@ -582,8 +683,79 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, doubl

 // ====== Computation ======

-static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
-    if (graph != opt_ctx->gf) {
+void ggml_opt_prepare_alloc(
+        ggml_opt_context_t    opt_ctx,
+        struct ggml_context * ctx_compute,
+        struct ggml_cgraph  * gf,
+        struct ggml_tensor  * inputs,
+        struct ggml_tensor  * outputs) {
+    GGML_ASSERT(!opt_ctx->static_graphs);
+    opt_ctx->ctx_compute = ctx_compute;
+    opt_ctx->gf          = gf;
+    opt_ctx->inputs      = inputs;
+    opt_ctx->outputs     = outputs;
+}
+
+void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
+    GGML_ASSERT(!opt_ctx->eval_ready);
+    if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
+        ggml_graph_reset(opt_ctx->gb_grad);
+    }
+    if (backward) {
+        const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+        opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
+    } else {
+        opt_ctx->build_type = GGML_OPT_BUILD_TYPE_FORWARD;
+    }
+
+    if (!opt_ctx->static_graphs) {
+        ggml_opt_build(opt_ctx);
+    }
+
+    struct ggml_cgraph * graph = nullptr;
+    switch (opt_ctx->build_type) {
+        case GGML_OPT_BUILD_TYPE_FORWARD: {
+            graph = opt_ctx->gf;
+        } break;
+        case GGML_OPT_BUILD_TYPE_GRAD: {
+            graph = opt_ctx->gb_grad;
+        } break;
+        case GGML_OPT_BUILD_TYPE_OPT: {
+            graph = opt_ctx->gb_opt;
+        } break;
+    }
+    GGML_ASSERT(graph);
+
+    if (opt_ctx->allocated_graph == graph) {
+        opt_ctx->eval_ready = true;
+        return;
+    }
+
+    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+
+    if (opt_ctx->static_graphs) {
+        ggml_init_params params = {
+            /*.mem_size   =*/ graph->size*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph->size, graph->grads),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_free(opt_ctx->ctx_copy);
+        opt_ctx->ctx_copy = ggml_init(params);
+
+        opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
+    } else {
+        opt_ctx->allocated_graph_copy = graph;
+    }
+
+    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->allocated_graph = graph;
+
+    opt_ctx->eval_ready = true;
+}
+
+void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
+    GGML_ASSERT(opt_ctx->eval_ready);
+    if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
        struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);

        GGML_ASSERT(opt_pars.adamw.alpha >  0.0f);
@@ -609,9 +781,19 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
        adamw_par_data[6] = beta2h;
    }

-    ggml_opt_alloc_graph(opt_ctx, graph);
    ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
    opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
+    opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+
+    if (!opt_ctx->static_graphs) {
+        opt_ctx->gf                   = nullptr;
+        opt_ctx->gb_grad              = nullptr;
+        opt_ctx->gb_opt               = nullptr;
+        opt_ctx->allocated_graph      = nullptr;
+        opt_ctx->allocated_graph_copy = nullptr;
+    }
+
+    opt_ctx->eval_ready = false;

    if (!result) {
        return;
@@ -635,12 +817,14 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
    ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
    result->loss.push_back(loss);

-    GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
-    std::vector<int32_t> pred(ndata);
-    ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
-    result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+    if (opt_ctx->pred) {
+        GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
+        std::vector<int32_t> pred(ndata);
+        ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
+        result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+    }

-    if (!opt_ctx->labels || result->ncorrect < 0) {
+    if (!opt_ctx->ncorrect || result->ncorrect < 0) {
        result->ncorrect = -1;
        return;
    }
@@ -652,26 +836,6 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
    result->ncorrect += ncorrect;
 }

-void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
-    ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
-}
-
-void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
-    if (opt_ctx->opt_period == 1) {
-        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
-        return;
-    }
-
-    const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
-    if (opt_i_next == 0) {
-        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
-        ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
-    } else {
-        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
-    }
-    opt_ctx->opt_i = opt_i_next;
-}
-
 // ====== High-Level Functions ======

 void ggml_opt_epoch(
@@ -682,6 +846,7 @@ void ggml_opt_epoch(
        int64_t                 idata_split,
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval) {
+    GGML_ASSERT(ggml_opt_static_graphs(opt_ctx) && "ggml_opt_epoch requires static graphs");
    struct ggml_tensor * inputs = ggml_opt_inputs(opt_ctx);
    struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
    struct ggml_tensor * data   = ggml_opt_dataset_data(dataset);
@@ -700,16 +865,18 @@ void ggml_opt_epoch(
    int64_t ibatch = 0;
    int64_t t_loop_start = ggml_time_us();
    for (; ibatch < ibatch_split; ++ibatch) {
+        ggml_opt_alloc(opt_ctx, /*backward =*/ true);
        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
-        ggml_opt_forward_backward(opt_ctx, result_train);
+        ggml_opt_eval(opt_ctx, result_train);
        if (callback_train) {
            callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
        }
    }
    t_loop_start = ggml_time_us();
    for (; ibatch < nbatches; ++ibatch) {
+        ggml_opt_alloc(opt_ctx, /*backward =*/ false);
        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
-        ggml_opt_forward(opt_ctx, result_eval);
+        ggml_opt_eval(opt_ctx, result_eval);
        if (callback_eval) {
            callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
        }
@@ -726,13 +893,26 @@ void ggml_opt_epoch_callback_progress_bar(
        int64_t            t_start_us) {
    fprintf(stderr, "%s[", train ? "train: " : "val:   ");

-    constexpr int64_t bar_length = 25;
+    // The progress bar consists of partially filled blocks, unicode has 8 separate fill levels.
+    constexpr int64_t bar_length = 8;
+    const int64_t ibatch8 = 8 * ibatch;
    for (int64_t j = 0; j < bar_length; ++j) {
-        const int64_t ibatch_j = ibatch_max * j/bar_length;
-        if (ibatch_j < ibatch) {
-            fprintf(stderr, "=");
-        } else if (ibatch_max * (j - 1)/bar_length < ibatch) {
-            fprintf(stderr, ">");
+        if        (ibatch_max * (8*j + 8) / bar_length < ibatch8) {
+            fprintf(stderr, "\u2588"); // full block
+        } else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) {
+            fprintf(stderr, "\u2589"); // 7/8 filled
+        } else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258A"); // 6/8 filled
+        } else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258B"); // 5/8 filled
+        } else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258C"); // 4/8 filled
+        } else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258D"); // 3/8 filled
+        } else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258E"); // 2/8 filled
+        } else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258F"); // 1/8 filled
        } else {
            fprintf(stderr, " ");
        }
@@ -764,8 +944,8 @@ void ggml_opt_epoch_callback_progress_bar(
    const int64_t t_eta_m = t_eta_s / 60;
    t_eta_s -= t_eta_m * 60;

-    fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
-            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
+    fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% "
+            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r",
            idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
            t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
    if (ibatch == ibatch_max) {
@@ -806,7 +986,10 @@ void ggml_opt_fit(

    int64_t epoch = 1;

-    ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
+    ggml_opt_params params = ggml_opt_default_params(backend_sched, loss_type);
+    params.ctx_compute     = ctx_compute;
+    params.inputs          = inputs;
+    params.outputs         = outputs;
    params.opt_period      = opt_period;
    params.get_opt_pars    = get_opt_pars;
    params.get_opt_pars_ud = &epoch;
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -49,35 +49,38 @@ endif()
 target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")

 # Link against oneDNN
-find_package(DNNL)
 set(GGML_SYCL_DNNL 0)
-if(DNNL_FOUND)
-    if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
-        # Assuming oneDNN packaged with oneapi release is used which
-        # supports only intel target
-        set(DNNL_GPU_VENDOR "INTEL")
-        if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
-            message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
+if(GGML_SYCL_DNN)
+    find_package(DNNL)
+    if(DNNL_FOUND)
+        if (NOT DEFINED DNNL_GPU_VENDOR)
+            # default to intel target
+            set(DNNL_GPU_VENDOR "INTEL")
+            if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
+                message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
+            endif()
        endif()
-    endif()

-    # Verify oneDNN was compiled for the same target as llama
-    if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
-        target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
-        set(GGML_SYCL_DNNL 1)
-        get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
-        foreach(CONFIG ${CONFIGS})
-            get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
-            message(STATUS "Found oneDNN: ${DNNL_LIB}")
-        endforeach()
+        # Verify oneDNN was compiled for the same target as llama
+        if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
+            target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
+            set(GGML_SYCL_DNNL 1)
+            get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
+            foreach(CONFIG ${CONFIGS})
+                get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
+                message(STATUS "Found oneDNN: ${DNNL_LIB}")
+            endforeach()
+        else()
+            message(WARNING
+                "oneDNN must be compiled for the same target as llama.cpp.
+                 llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
+                 Disabling oneDNN support.")
+        endif()
    else()
-        message(WARNING
-            "oneDNN must be compiled for the same target as llama.cpp.
-             llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
-             Disabling oneDNN support.")
+        message(STATUS "oneDNN not found, disabling oneDNN support")
    endif()
 else()
-    message(STATUS "oneDNN not found, disabling oneDNN support")
+    message(STATUS "oneDNN support disabled by the user")
 endif()
 target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})

@@ -108,6 +111,9 @@ endif()
 if (GGML_SYCL_TARGET STREQUAL "INTEL")
    # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
    # See https://github.com/uxlfoundation/oneMath/issues/654
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(SYCL_COMPILER ON)
+    endif()
    find_package(MKL REQUIRED)
    target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
--- a/ggml/src/ggml-sycl/binbcast.cpp
+++ b/ggml/src/ggml-sycl/binbcast.cpp
@@ -1,93 +1,74 @@
 #include "binbcast.hpp"

+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/sycl.hpp>

+#include "dpct/helper.hpp"
 #include "ggml.h"

-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1));
-    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) /
-                   ne3;
-    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) %
-                   ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0;
-         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1,
+                                                   dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) {
+    auto element_id   = it.get_global_id(0);
+    auto global_range = it.get_global_range(0);
+    for (; element_id < num_elements; element_id += global_range) {
+        auto  src0_float_val = sycl::vec(src0[element_id]).template convert<float, sycl::rounding_mode::rte>();
+        auto  src1_float_val = sycl::vec(src1[element_id]).template convert<float, sycl::rounding_mode::rte>();
+        float dst_val        = bin_op(src0_float_val[0], src1_float_val[0]);
+        auto  val_to_store   = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
+        dst[element_id]      = val_to_store;
    }
 }

-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst,
+                                        int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13,
+                                        int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10,
+                                        int s11, int s12, int s13, std::size_t num_dst_elements,
+                                        const sycl::nd_item<1> & item_ct1) {
+    auto calculate_logical_index =
+        [](const std::array<int, 4> & dims, std::size_t element_id) __attribute__((always_inline))->std::array<int, 4> {
+        std::array<int, 4> logical_index;
+#pragma unroll(4)
+        for (int i = 3; i >= 0; i--) {
+            logical_index[i] = element_id % dims[i];
+            element_id /= dims[i];
+        }
+        return logical_index;
+    };

-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
+    auto calculate_index = [](const std::array<int, 4> & dims, const std::array<int, 4> & strides,
+                              const std::array<int, 4> & indices) __attribute__((always_inline))
+                               ->std::size_t {
+        std::size_t index = 0;
+#pragma unroll(4)
+        for (int i = 0; i < 4; i++) {
+            auto index_i = indices[i];
+            if (indices[i] >= dims[i]) {
+                index_i = indices[i] % dims[i];
+            }
+            index += strides[i] * index_i;
+        }
+        return index;
+    };

-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
+    auto element_id = item_ct1.get_global_id(0);
+    for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) {
+        auto  logical_index  = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id);
+        auto  src_0_index    = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index);
+        auto  src_1_index    = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index);
+        auto  dst_index      = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index);
+        auto  src0_float_val = sycl::vec(src0[src_0_index]).template convert<float, sycl::rounding_mode::rte>();
+        auto  src1_float_val = sycl::vec(src1[src_1_index]).template convert<float, sycl::rounding_mode::rte>();
+        float dst_val        = bin_op(src0_float_val[0], src1_float_val[0]);
+        auto  val_to_store   = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
+        dst[dst_index]       = val_to_store;
    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 }

-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_sycl {
+template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
    template <typename src0_t, typename src1_t, typename dst_t>
    void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
                    const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
@@ -96,165 +77,73 @@ struct bin_bcast_sycl {
                    const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
                    const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
                    const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
-        int nr0 = ne10 / ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne[] = {ne0, ne1, ne2, ne3};
-        int64_t cne0[] = {ne00, ne01, ne02, ne03};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb[] = {nb0, nb1, nb2, nb3};
-        size_t cnb0[] = {nb00, nb01, nb02, nb03};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
+        auto check_bcast_required = [](const std::array<int64_t, 4> & src_dims,
+                                       const std::array<int64_t, 4> & dst_dims) -> bool {
            for (int i = 0; i < 4; i++) {
-                if (nr[i] != 1) {
-                    break;
-                }
-                if (i > 0) {
-                    collapse_nb(cnb, cne);
-                    collapse_nb(cnb0, cne0);
-                    collapse_nb(cnb1, cne1);
-                    collapse(cne);
-                    collapse(cne0);
-                    collapse(cne1);
+                if (dst_dims[i] > src_dims[i]) {
+                    return true;
                }
            }
-        }
-        {
-            int64_t ne0 = cne[0];
-            int64_t ne1 = cne[1];
-            int64_t ne2 = cne[2];
-            int64_t ne3 = cne[3];
+            return false;
+        };

-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
+        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-            size_t nb0 = cnb[0];
-            size_t nb1 = cnb[1];
-            size_t nb2 = cnb[2];
-            size_t nb3 = cnb[3];
+        GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb3 % sizeof(dst_t) == 0);

-            size_t nb00 = cnb0[0];
-            size_t nb01 = cnb0[1];
-            size_t nb02 = cnb0[2];
-            size_t nb03 = cnb0[3];
+        GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb03 % sizeof(src0_t) == 0);

-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
+        GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
+        // dst strides in number of elements
+        size_t s0 = nb0 / sizeof(dst_t);
+        size_t s1 = nb1 / sizeof(dst_t);
+        size_t s2 = nb2 / sizeof(dst_t);
+        size_t s3 = nb3 / sizeof(dst_t);

-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
+        // src1 strides in number of elements
+        size_t s10 = nb10 / sizeof(src0_t);
+        size_t s11 = nb11 / sizeof(src1_t);
+        size_t s12 = nb12 / sizeof(src1_t);
+        size_t s13 = nb13 / sizeof(src1_t);

-            size_t s00 = nb00 / sizeof(src0_t);
-            size_t s01 = nb01 / sizeof(src0_t);
-            size_t s02 = nb02 / sizeof(src0_t);
-            size_t s03 = nb03 / sizeof(src0_t);
+        // src0 strides in number of elements
+        size_t s00 = nb00 / sizeof(src0_t);
+        size_t s01 = nb01 / sizeof(src0_t);
+        size_t s02 = nb02 / sizeof(src0_t);
+        size_t s03 = nb03 / sizeof(src0_t);

-            GGML_UNUSED(s00);
+        std::size_t num_dst_elements = static_cast<std::size_t>(ne0) * static_cast<std::size_t>(ne1) *
+                                       static_cast<std::size_t>(ne2) * static_cast<std::size_t>(ne3);
+        std::size_t local_range  = 256;
+        std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range;

-            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+        bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) ||
+                                  check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 });
+        bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous;

-            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
-            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            sycl::range<3> block_dims(1, 1, 1);
-            block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] = std::min<unsigned int>(
-                ne1, block_size / (unsigned int)block_dims[2]);
-            block_dims[0] = std::min(
-                std::min<unsigned int>(
-                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
-                                   (unsigned int)block_dims[1]),
-                64U);
-
-            sycl::range<3> block_nums(
-                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
-                (ne1 + block_dims[1] - 1) / block_dims[1],
-                (hne0 + block_dims[2] - 1) / block_dims[2]);
-
-            if (block_nums[0] > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                {
-                    dpct::has_capability_or_fail(stream->get_device(),
-                                                 {sycl::aspect::fp16});
-
-                    stream->parallel_for(
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
-                                              sycl::range<3>(1, 1, block_size),
-                                          sycl::range<3>(1, 1, block_size)),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_bin_bcast_unravel<bin_op>(
-                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
-                                s03, s11, s12, s13, item_ct1);
-                        });
-                }
-            } else {
-                /*
-                DPCT1049:16: The work-group size passed to the SYCL kernel may
-                exceed the limit. To get the device limit, query
-                info::device::max_work_group_size. Adjust the work-group size if
-                needed.
-                */
-                dpct::has_capability_or_fail(stream->get_device(),
-                                             {sycl::aspect::fp16});
-
-                stream->parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
-                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s01, s02, s03, s11, s12, s13,
-                                            item_ct1);
-                    });
-            }
+        if (! needs_broadcasting && all_contiguous) {
+            stream->submit([&](sycl::handler & cgh) {
+                cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
+                    k_bin_bcast_contiguous<bin_op>(src0_dd, src1_dd, dst_dd, num_dst_elements, it);
+                });
+            });
+        } else {
+            stream->submit([&](sycl::handler & cgh) {
+                cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
+                    k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1,
+                                        s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it);
+                });
+            });
        }
    }
 };
--- a/ggml/src/ggml-sycl/convert.cpp
+++ b/ggml/src/ggml-sycl/convert.cpp
@@ -183,6 +183,24 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
    }
 }

+template <typename dst_t>
+static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    const size_t  local_size  = 32;
+    const size_t  global_size = nb * local_size;
+
+    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
+
+        cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
+                         [=](sycl::nd_item<1> item_ct1) {
+                             dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
+                         });
+    });
+}
+
 template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
                                     dpct::queue_ptr stream) {
@@ -504,7 +522,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_sycl;
        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
+            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_K_sycl_reorder;
+            } else {
+                return dequantize_row_q4_K_sycl;
+            }
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_sycl;
        case GGML_TYPE_Q6_K:
@@ -556,7 +578,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
        case GGML_TYPE_Q3_K:
            return dequantize_row_q3_K_sycl;
        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_K_sycl_reorder;
+            } else {
+                return dequantize_row_q4_K_sycl;
+            }
        case GGML_TYPE_Q5_K:
            return dequantize_row_q5_K_sycl;
        case GGML_TYPE_Q6_K:
--- a/ggml/src/ggml-sycl/dequantize.hpp
+++ b/ggml/src/ggml-sycl/dequantize.hpp
@@ -357,6 +357,28 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
 }
 #endif

+template <typename dst_t>
+inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
+                                   const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
+    const int is = 2 * il;
+    constexpr int n  = 4;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, scales_local, sc, m);
+    const float d1 = dall * sc;
+    const float m1 = dmin * m;
+
+    get_scale_min_k4(is + 1, scales_local, sc, m);
+    const float d2 = dall * sc;
+    const float m2 = dmin * m;
+
+    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
+    for (int l = 0; l < n; ++l) {
+        y[l + 0]  = d1 * (q_vec[l] & 0xF) - m1;
+        y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
+    }
+}
+
 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
@@ -365,36 +387,22 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
    const int64_t i = item_ct1.get_group(2);

 #if QK_K == 256
-    // assume 32 threads
    const int64_t tid = item_ct1.get_local_id(2);
-    const int64_t il  = tid/8;
-    const int64_t ir  = tid%8;
-    const int64_t is  = 2*il;
-    const int64_t n   = 4;
+    const int64_t il  = tid / 8;
+    const int64_t ir  = tid % 8;

-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;

    const sycl::half2 dm = x[i].dm;
    const float dall = dm[0];
    const float dmin = dm[1];

-    if (tid < 12)
+    if (tid < 12) {
        scales_local[tid] = x[i].scales[tid];
-    item_ct1.barrier(sycl::access::fence_space::local_space);
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, scales_local, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, scales_local, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
-        y[l +32] = d2 * (q_vec[l] >>  4) - m2;
    }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
 #else
    const int64_t tid = item_ct1.get_local_id(2);
    const uint8_t * q = x[i].qs;
@@ -406,6 +414,36 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
 #endif
 }

+template <typename dst_t>
+static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
+                                          const sycl::nd_item<1> & item_ct1, int64_t nb) {
+    const int64_t i   = item_ct1.get_group(0);     // block index
+    const int64_t tid = item_ct1.get_local_id(0);  // thread index within block
+    const int64_t il  = tid / 8;
+    const int64_t ir  = tid % 8;
+
+    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
+
+    const uint8_t * base          = static_cast<const uint8_t *>(vx);
+    const size_t    qs_offset     = i * (QK_K / 2);
+    const size_t    scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
+    const size_t    dm_offset     = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);
+
+    const uint8_t *    qs_ptr     = base + qs_offset;
+    const uint8_t *    scales_ptr = base + scales_offset;
+    ggml_half2         dm_values  = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
+
+    const float dall = dm_values.x();
+    const float dmin = dm_values.y();
+
+    if (tid < 12) {
+        scales_local[tid] = scales_ptr[tid];
+    }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
+}
+
 template<typename dst_t>
 static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  const sycl::nd_item<3> &item_ct1) {
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -1129,7 +1129,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
            dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                // reorder is currently not supported for dmmv
+                GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
+            } else {
+                dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            }
            break;
        case GGML_TYPE_Q5_K:
            dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -655,7 +655,6 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -688,7 +687,6 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -722,7 +720,6 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -754,7 +751,6 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -786,7 +782,6 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -818,7 +813,6 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -850,7 +844,6 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -883,7 +876,6 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -917,7 +909,6 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -949,7 +940,6 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -981,7 +971,6 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1013,7 +1002,6 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1045,7 +1033,6 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1078,7 +1065,6 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1110,7 +1096,6 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1142,7 +1127,6 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1174,7 +1158,6 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1206,7 +1189,6 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1241,7 +1223,6 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1273,7 +1254,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1315,7 +1295,6 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1350,7 +1329,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

@@ -1388,7 +1366,6 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
-            break;
    }
 }

--- a/ggml/src/ggml-sycl/gemm.hpp
+++ b/ggml/src/ggml-sycl/gemm.hpp
@@ -32,16 +32,36 @@ public:
        else static_assert(0);
    }

-    static inline void row_gemm(ggml_backend_sycl_context & ctx, bool a_trans, bool b_trans, int m, int n, int k,
-                                const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
+    // matrix A has m rows, k columns
+    // matrix B has k rows, n columns
+    // nra - number of elements to skip when moving into next row in A
+    // nrb - number of elements to skip when moving into next row in B
+    // nca - number of elements to skip when moving into next column in A
+    // ncb - number of elements to skip when moving into next column in B
+    // stride_a - number of elements to skip when moving to next A matrix
+    // stride_b - number of elements to skip when moving to next B matrix
+    // batches_a - number of A matrices
+    // batches_b - number of B matrices
+    static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
+        const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
+        const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
+        void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) {
+
        auto stream = ctx.stream_dnnl(q);
        auto eng = ctx.engine_dnnl(q);
-        dnnl::memory::dims a_dims = { m, k };
-        dnnl::memory::dims b_dims = { k, n };
-        dnnl::memory::dims c_dims = { m, n };
-        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
-        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
-        const auto c_md    = dnnl::memory::desc(c_dims, ct, tag::ab);
+
+        // { # strides, # rows, # columns }
+        dnnl::memory::dims a_dims = { batches_a, m, k };
+        dnnl::memory::dims b_dims = { batches_b, k, n };
+        dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n };
+
+        // { # elements to skip to next stride, # elements to skip to next row, # elements to skip to next column }
+        dnnl::memory::dims a_strides = { stride_a, nra, nca };
+        dnnl::memory::dims b_strides = { stride_b, nrb, ncb };
+
+        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides);
+        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
+        const auto c_md    = dnnl::memory::desc(c_dims, ct, tag::abc);

        dnnl::primitive_attr primitive_attr;
        primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
@@ -63,6 +83,15 @@ public:

        matmul_prim.execute(stream, matmul_args);
    }
+
+    // matrices A and B are column major, both having k rows
+    // matrix A has m column, matrix B has n columns
+    // output: column major matrix C = A transposed * B
+    static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
+        const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
+
+        gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
+    }
 };

 #endif
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
+int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;

 static ggml_sycl_device_info ggml_sycl_init() {
@@ -196,12 +197,22 @@ static void ggml_check_sycl() try {
        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
        g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
+        g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
        g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
        GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
        GGML_LOG_INFO("Running with Environment Variables:\n");
        GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+#ifdef GGML_SYCL_GRAPH
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
+#else
+        GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
+#endif
+#if GGML_SYCL_DNNL
+        GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
+#else
+        GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
+#endif
        GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
        GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
@@ -341,7 +352,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
        assert(tensor->view_src->buffer->buft == buffer->buft);
        return GGML_STATUS_SUCCESS;
    }
-    if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
+    if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
        ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
        tensor->extra                 = extra;
        ctx->tensor_extras.push_back(extra);  //used to release it when destroy ctx.
@@ -374,16 +385,17 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
    ggml_sycl_set_device(ctx->device);
    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+    SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+#ifndef _WIN32
    // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU.
    // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here.
-    char* host_buf = (char*)malloc(size);
+    char * host_buf = (char *) malloc(size);
    memcpy(host_buf, data, size);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
-                             .wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
    free(host_buf);
+#else
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait()));
+#endif
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -1985,19 +1997,18 @@ inline void ggml_sycl_op_mul_mat_sycl(

    const int64_t ne00 = src0->ne[0];
    const int64_t ne10 = src1->ne[0];
-
+    GGML_ASSERT(ne00 == ne10);

    const int64_t row_diff = row_high - row_low;

    int id;
    SYCL_CHECK(
        CHECK_TRY_ERROR(id = get_current_device_id()));
-#if !GGML_SYCL_DNNL
-    const int64_t ne0 = dst->ne[0];
+
+    const int64_t ne0 = dst->ne[0]; // used by MKL only
    // the main device has a larger memory buffer to hold the results from all GPUs
    // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = id == ctx.device ? ne0 : row_diff;
-#endif
+    int ldc = id == ctx.device ? ne0 : row_diff; // used by MKL only

 #ifdef GGML_SYCL_F16
    bool use_fp16 = true;  // TODO(Yu) SYCL capability check
@@ -2033,25 +2044,29 @@ inline void ggml_sycl_op_mul_mat_sycl(
                                         : src1_as_f16.get();
        ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);

-#if !GGML_SYCL_DNNL
-        const sycl::half alpha_f16 = 1.0f;
-        const sycl::half beta_f16  = 0.0f;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
-            *stream, oneapi::math::transpose::trans,
-            oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
-            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
-            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
-            dst_f16.get(), dpct::library_data_t::real_half, ldc,
-            dpct::library_data_t::real_half)));
-        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
-        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-#else
-        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
-                                  DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
-                                  dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
-        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
-        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
+#if GGML_SYCL_DNNL
+        if (!g_ggml_sycl_disable_dnn) {
+            DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
+                                      DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
+                                      dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
+            to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
+        }
+        else
 #endif
+        {
+            const sycl::half alpha_f16 = 1.0f;
+            const sycl::half beta_f16  = 0.0f;
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
+                *stream, oneapi::math::transpose::trans,
+                oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
+                &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
+                src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
+                dst_f16.get(), dpct::library_data_t::real_half, ldc,
+                dpct::library_data_t::real_half)));
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
+            to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        }
    }
    else {
        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
@@ -2072,18 +2087,22 @@ inline void ggml_sycl_op_mul_mat_sycl(
        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

-#if !GGML_SYCL_DNNL
-        const float alpha = 1.0f;
-        const float beta  = 0.0f;
-        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
-            get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
-            src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
-            dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
-#else
-        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
-                                  DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
-                                  dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
+#if GGML_SYCL_DNNL
+        if (!g_ggml_sycl_disable_dnn) {
+            DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
+                                      DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
+                                      dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
+        }
+        else
 #endif
+        {
+            const float alpha = 1.0f;
+            const float beta  = 0.0f;
+            SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
+                get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
+                src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
+                dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+        }
    }
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
@@ -2697,7 +2716,7 @@ catch (sycl::exception const &exc) {
  std::exit(1);
 }

-static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, char * dst,
+static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, void * dst,
                                   const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
                                   size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
                                   int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
@@ -2713,7 +2732,7 @@ static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::h

    const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
    const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
-    uint8_t *       dst_bytes  = reinterpret_cast<uint8_t *>(dst);
+    uint8_t *       dst_bytes  = static_cast<uint8_t *>(dst);

    ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
    ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
@@ -2726,6 +2745,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    GGML_TENSOR_BINARY_OP_LOCALS

@@ -2766,7 +2786,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    }

    ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
-    char *                           dst_t = reinterpret_cast<char *>(dst_ddf);

    dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
    dpct::library_data_t mkl_data_type    = dpct::library_data_t::real_float;
@@ -2783,42 +2802,83 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
+    GGML_ASSERT(ne01 == static_cast<int64_t>(nb1/nb0));
+    GGML_ASSERT(ne10 == ne00);

    // broadcast factors
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;

-    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
-                                                    oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
-                                                    src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
-                                                    src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_t,
-                                                    mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
-    } else {
-        const int ne23 = ne12 * ne13;
+#if GGML_SYCL_DNNL
+    if (!g_ggml_sycl_disable_dnn) {
+        auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
+            (const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {

-        ggml_sycl_pool_alloc<const void *>         ptrs_src(ctx.pool(), 2 * ne23);
-        ggml_sycl_pool_alloc<void *>               ptrs_dst(ctx.pool(), 1 * ne23);
-        ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
+            DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
+                            src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
+                            src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
+                            dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
+        };

-        sycl::range<3> block_dims(1, ne12, ne13);
-        queue->submit([&](sycl::handler & cgh) {
-            const void ** ptrs_src_get = ptrs_src.get();
-            void **       ptrs_dst_get = ptrs_dst.get();
-            size_t        nb12_scaled  = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
-            size_t        nb13_scaled  = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
-            cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
-                k_compute_batched_ptrs(src0_f16, src1_f16, dst_t, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
-                                       nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
+        if (r2 == 1 && r3 == 1) {
+            if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+                dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
+            }
+            else {
+                for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
+                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
+                    const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
+                    float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
+                    dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
+                }
+            }
+        } else {
+            // iterate over batches from smaller set of matrices (matrix 0)
+            for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
+                for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
+                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
+                    const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
+                    float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
+                    dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
+                }
+            }
+        }
+    }
+    else
+#endif
+    {
+        if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+            // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
+                                                        oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+                                                        src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
+                                                        src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
+                                                        mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
+        } else {
+            const int ne23 = ne12 * ne13;
+
+            ggml_sycl_pool_alloc<const void *>         ptrs_src(ctx.pool(), 2 * ne23);
+            ggml_sycl_pool_alloc<void *>               ptrs_dst(ctx.pool(), 1 * ne23);
+            ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
+
+            sycl::range<3> block_dims(1, ne12, ne13);
+            queue->submit([&](sycl::handler & cgh) {
+                const void ** ptrs_src_get = ptrs_src.get();
+                void **       ptrs_dst_get = ptrs_dst.get();
+                size_t        nb12_scaled  = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
+                size_t        nb13_scaled  = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
+                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                    k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
+                                           nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
+                });
            });
-        });

-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-            *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
-            (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
-            (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
-            (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
+                *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+                (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
+                (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
+                (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
+        }
    }
 } catch (const sycl::exception & exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
@@ -2841,6 +2901,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
            return true;
+        case GGML_TYPE_Q4_K:
+            return !g_ggml_sycl_prioritize_dmmv;
        default:
            return false;
    }
@@ -2858,6 +2920,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
 inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_K:
            return true;
        default:
            return false;
@@ -2883,16 +2946,16 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
    }
 }

-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
+                            dpct::queue_ptr stream) {
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
    SYCL_CHECK(
        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
            .wait()));
    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto qs_ptr      = data_device + offset_blks * QK4_0 / 2;
    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

    stream->parallel_for(
@@ -2906,25 +2969,66 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
            }
            *(d_ptr + ib) = x[ib].d;
-        });
+        }).wait_and_throw();
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
+    GGML_ASSERT(size % sizeof(block_q4_K) == 0);
+    GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
+
+    const int nblocks = size / sizeof(block_q4_K);
+
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+
+    auto * qs_ptr     = data_device;
+    auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
+    auto * dm_ptr     = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
+
+    stream->parallel_for(nblocks, [=](auto i) {
+        const block_q4_K * x  = (const block_q4_K *) tmp_buf;
+        const int          ib = i;
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+
+        dm_ptr[ib] = x[ib].dm;
+    }).wait_and_throw();

    sycl::free(tmp_buf, *stream);
 }

 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
+    uint8_t * data_device = (uint8_t *) src0->data;
    size_t ncols = src0->ne[0];
    size_t nrows = src0->ne[1];
    size_t size = ggml_nbytes(src0);

-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            reorder_qw_q4_k(data_device, size, 0, stream);
+            break;
+        default:
+            GGML_ABORT("reorder_qw() called with unsupported type");
+            break;
+    }
 }

 static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
    return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
            ctx.opt_feature.reorder &&      //allow this device due to good perf, skip the devices with bad perf.
            dst->op == GGML_OP_MUL_MAT &&   //limit to some supported cases of Q4_0, to do for more cases.
-            dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
+            dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
 }

 static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
@@ -2960,8 +3064,18 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
    extra->optimized_feature.reorder = true;  // Used to decode/dequan in next steps and avoid re-reordering
 }

-static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

+static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+}
+
+static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+}
+
+static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
    int64_t min_compute_capability = INT_MAX;

@@ -2984,13 +3098,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    }

    // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);

-    bool use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);

    bool use_mul_mat_q =  ggml_sycl_supports_mmq(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
@@ -3041,8 +3151,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
    } else {
        constexpr bool convert_src1_to_q8_1 = false;
-        // MUL_MAT_SYCL supports reorder
-        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
    }
    GGML_SYCL_DEBUG("call %s done\n", __func__);
@@ -3632,7 +3740,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
    GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
    const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
    SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
-        data, (const char *)tensor->data + offset, size).wait()));
+        data, (const char *)tensor->data + offset, size)));
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3652,7 +3760,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
        */
        const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
        SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
-            dst->data, src->data, ggml_nbytes(dst)).wait()));
+            dst->data, src->data, ggml_nbytes(dst))));
        return true;
    }

@@ -3701,11 +3809,43 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
    }
 }

+#ifdef GGML_SYCL_GRAPH
+static bool check_graph_compatibility(ggml_cgraph * cgraph) {
+    if (ggml_sycl_info().device_count > 1) {
+        // A sycl_ex::command_graph object can only be created for a single device
+        GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__);
+        return false;
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        const ggml_op node_op = cgraph->nodes[i]->op;
+        switch (node_op) {
+            default:
+                break;
+            case GGML_OP_CONCAT:
+                // ggml_sycl_op_concat() does a blocking host wait after memcpy operations,
+                // but wait() can't be called on the events returned by a queue recording
+                // to a graph.
+                [[fallthrough]];
+            case GGML_OP_MUL_MAT_ID:
+                // ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after
+                // submitting a memcpy operation, but wait() can't be called on a queue that
+                // is recording to a graph.
+                GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
+                              ggml_op_name(node_op));
+                return false;
+        }
+    }
+    return true;
+}
+#endif
+
 static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);

 #ifdef GGML_SYCL_GRAPH
-    if (!g_ggml_sycl_disable_graph) {
+    bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph);
+    if (use_sycl_graph) {
        const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph);
        if (!graph_support) {
            GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
@@ -3713,7 +3853,8 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
            return GGML_STATUS_SUCCESS;
        }

-        sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
+        sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()), {sycl_ex::property::graph::assume_buffer_outlives_graph{}});
+
        model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
        ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
        model_sycl_graph.end_recording();
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -24,6 +24,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
    const int     blocks_per_row              = ncols / block_traits::qk;
    constexpr int blocks_per_subgroup         = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
+    const int     nblocks                     = nrows * (ncols / block_traits::qk);

    static_assert(blocks_per_subgroup > 0);
    static_assert(block_elements_per_subgroup > 0);
@@ -45,7 +46,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
            // x block quant index when casting the quants to int
            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);

-            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
+            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
        }
    }

@@ -739,6 +740,27 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
    }
 }

+static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
+    const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+
+    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
+    constexpr size_t num_subgroups = 16;
+    GGML_ASSERT(block_num_y % num_subgroups == 0);
+
+    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
+    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                            [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
+                                                                                            nrows, nd_item);
+                            });
+    });
+}
+
+
 static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
@@ -1035,7 +1057,12 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_K:
-                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                    reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                } else {
+                    mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                }
                break;
            case GGML_TYPE_Q5_K:
                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -56,6 +56,28 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
 };

+template <> struct block_q_t<GGML_TYPE_Q4_K> {
+    struct traits {
+        static constexpr uint32_t qk       = QK_K;
+        static constexpr uint32_t qi       = QI4_K;
+        static constexpr uint32_t qr       = QR4_K;
+        static constexpr uint32_t vdr_mmvq = 2;
+    };
+
+    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+
+    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
+        auto nblocks = (nrows * (ncols / traits::qk));
+        return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+
+    constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
+
+    constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
+};
+
 }  // namespace ggml_sycl_reordered

 #endif  // GGML_SYCL_QUANTS_HPP
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -285,7 +285,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
    }

    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                     const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
        const ggml_half d     = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
        int             v[q4_0_traits::vdr_mmvq];
@@ -303,6 +303,67 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
    };
 };

+static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
+                                             const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
+                                             const int &        iqs) {
+    int   v[2];
+    int   u[2 * QR4_K];
+    float d8[QR4_K];
+
+    v[0] = q4[0];
+    v[1] = q4[4];
+
+    uint16_t  aux[2];
+    const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
+    if (j < 2) {
+        aux[0] = scales[j + 0] & 0x3f3f;
+        aux[1] = scales[j + 2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
+    }
+
+    const uint8_t * sc = (const uint8_t *) aux;
+    const uint8_t * m  = sc + 2;
+
+    const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i]                   = bq8i->ds[0];
+
+        const int * q8 = (const int *) bq8i->qs + ((iqs / 2) % 4);
+        u[2 * i + 0]   = q8[0];
+        u[2 * i + 1]   = q8[4];
+    }
+
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, dm, d8);
+}
+
+template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
+    static constexpr ggml_type gtype = GGML_TYPE_Q4_K;
+
+    using q4_k_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
+    using q4_k_traits = typename q4_k_block::traits;
+
+    float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
+                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
+        const int ib = ibx_offset / (QK_K / 2);
+
+        const uint8_t *    base           = static_cast<const uint8_t *>(vbq);
+        const uint8_t *    qs             = base + ibx_offset;
+        const int          total_qs_bytes = nblocks * (QK_K / 2);
+        const uint8_t *    scs            = base + total_qs_bytes + ib * K_SCALE_SIZE;
+        const ggml_half2 * dms            = reinterpret_cast<const ggml_half2 *>(base + d_offset);
+
+        const int        bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
+        const int *      q4         = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
+        const uint16_t * scales     = (const uint16_t *) scs;
+
+        return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
+    }
+};
+
 #define VDR_Q4_0_Q8_1_MMVQ 2
 #define VDR_Q4_0_Q8_1_MMQ  4

@@ -649,52 +710,17 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
 }

-static __dpct_inline__ float
-vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
+static __dpct_inline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
+                                               const int & iqs) {
 #ifndef GGML_QKK_64
+
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
+    const int        bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
+    const int *      q4         = (const int *) (bq4_K->qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
+    const uint16_t * scales     = (const uint16_t *) bq4_K->scales;

-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[0];
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
+    return vec_dot_q4_K_q8_1_common(q4, scales, bq4_K->dm, bq8_1, iqs);

 #else

--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -15,6 +15,32 @@ function(detect_host_compiler)
    set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE)
 endfunction()

+# Function to test shader extension support
+# Parameters:
+#  EXTENSION_NAME - Name of the extension to test (e.g., "GL_EXT_integer_dot_product")
+#  TEST_SHADER_FILE - Path to the test shader file
+#  RESULT_VARIABLE - Name of the variable to set (ON/OFF) based on test result
+function(test_shader_extension_support EXTENSION_NAME TEST_SHADER_FILE RESULT_VARIABLE)
+    execute_process(
+        COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${TEST_SHADER_FILE}"
+        OUTPUT_VARIABLE glslc_output
+        ERROR_VARIABLE glslc_error
+    )
+
+    if (${glslc_error} MATCHES ".*extension not supported: ${EXTENSION_NAME}.*")
+        message(STATUS "${EXTENSION_NAME} not supported by glslc")
+        set(${RESULT_VARIABLE} OFF PARENT_SCOPE)
+    else()
+        message(STATUS "${EXTENSION_NAME} supported by glslc")
+        set(${RESULT_VARIABLE} ON PARENT_SCOPE)
+        add_compile_definitions(${RESULT_VARIABLE})
+
+        # Ensure the extension support is forwarded to vulkan-shaders-gen
+        list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -D${RESULT_VARIABLE}=ON)
+        set(VULKAN_SHADER_GEN_CMAKE_ARGS "${VULKAN_SHADER_GEN_CMAKE_ARGS}" PARENT_SCOPE)
+    endif()
+endfunction()
+
 if (Vulkan_FOUND)
    message(STATUS "Vulkan found")

@@ -23,69 +49,40 @@ if (Vulkan_FOUND)
                             ../../include/ggml-vulkan.h
                            )

-    # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
+    set(VULKAN_SHADER_GEN_CMAKE_ARGS
+        -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
+        -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+    )

-    if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
-        message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
-        set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
-        set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    set(VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS "")
+    if (CMAKE_BUILD_TYPE AND CMAKE_BUILD_TYPE MATCHES "Debug|Release|MinSizeRel|RelWithDebInfo")
+        list(APPEND VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS --config=${CMAKE_BUILD_TYPE})
    endif()

-    # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
+    # Test all shader extensions
+    test_shader_extension_support(
+        "GL_KHR_cooperative_matrix"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
+        "GGML_VULKAN_COOPMAT_GLSLC_SUPPORT"
+    )

-    if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*")
-        message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc")
-        set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")
-        set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
-    endif()
+    test_shader_extension_support(
+        "GL_NV_cooperative_matrix2"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
+        "GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT"
+    )

-    # Compile a test shader to determine whether GL_EXT_integer_dot_product is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_integer_dot_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
+    test_shader_extension_support(
+        "GL_EXT_integer_dot_product"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_integer_dot_support.comp"
+        "GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT"
+    )

-    if (${glslc_error} MATCHES ".*extension not supported: GL_EXT_integer_dot_product.*")
-        message(STATUS "GL_EXT_integer_dot_product not supported by glslc")
-        set(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_EXT_integer_dot_product supported by glslc")
-        set(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
-    endif()
-
-    # Compile a test shader to determine whether GL_EXT_bfloat16 is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_bfloat16_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
-
-    if (${glslc_error} MATCHES ".*extension not supported: GL_EXT_bfloat16.*")
-        message(STATUS "GL_EXT_bfloat16 not supported by glslc")
-        set(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_EXT_bfloat16 supported by glslc")
-        set(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
-    endif()
+    test_shader_extension_support(
+        "GL_EXT_bfloat16"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_bfloat16_support.comp"
+        "GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT"
+    )

    target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
    target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
@@ -124,16 +121,8 @@ if (Vulkan_FOUND)
        add_compile_definitions(GGML_VULKAN_RUN_TESTS)
    endif()

-    if (NOT CMAKE_CROSSCOMPILING)
-        add_subdirectory(vulkan-shaders)
-        if (MSVC)
-            foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
-                string(TOUPPER ${CONFIG} CONFIG)
-                set_target_properties(vulkan-shaders-gen PROPERTIES
-                    RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
-            endforeach()
-        endif()
-    else()
+    # Set up toolchain for host compilation whether cross-compiling or not
+    if (CMAKE_CROSSCOMPILING)
        if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
            set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
        else()
@@ -146,25 +135,31 @@ if (Vulkan_FOUND)
            configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/host-toolchain.cmake.in ${CMAKE_BINARY_DIR}/host-toolchain.cmake @ONLY)
            set(HOST_CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/host-toolchain.cmake)
        endif()
-        message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
-
-        include(ExternalProject)
-        # Native build through ExternalProject_Add
-        ExternalProject_Add(
-            vulkan-shaders-gen
-            SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
-            CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE}
-                    -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
-                    -DGGML_VULKAN_COOPMAT_GLSLC_SUPPORT=${GGML_VULKAN_COOPMAT_GLSLC_SUPPORT}
-                    -DGGML_VULKAN_COOPMAT2_GLSLC_SUPPORT=${GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT}
-                    -DGGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT=${GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT}
-                    -DGGML_VULKAN_BFLOAT16_GLSLC_SUPPORT=${GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT}
-            BUILD_COMMAND ${CMAKE_COMMAND} --build .
-            INSTALL_COMMAND ${CMAKE_COMMAND} --install .
-            INSTALL_DIR ${CMAKE_BINARY_DIR}
-        )
-        ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)
+    else()
+        # For non-cross-compiling, use empty toolchain (use host compiler)
+        set(HOST_CMAKE_TOOLCHAIN_FILE "")
    endif()
+
+    # Always use ExternalProject_Add approach
+    include(ExternalProject)
+
+    # Add toolchain file if cross-compiling
+    if (CMAKE_CROSSCOMPILING)
+        list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE})
+        message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
+    endif()
+
+    # Native build through ExternalProject_Add
+    ExternalProject_Add(
+        vulkan-shaders-gen
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
+        CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS}
+        BUILD_COMMAND ${CMAKE_COMMAND} --build . ${VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS}
+        INSTALL_COMMAND ${CMAKE_COMMAND} --install .
+        INSTALL_DIR ${CMAKE_BINARY_DIR}
+    )
+    ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)
+
    set (_ggml_vk_host_suffix $<IF:$<STREQUAL:${CMAKE_HOST_SYSTEM_NAME},Windows>,.exe,>)
    set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix})
    set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
@@ -175,9 +170,8 @@ if (Vulkan_FOUND)
    file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
    set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen)

-    if (CMAKE_CROSSCOMPILING)
-        set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)
-    endif()
+    # Add build and install dependencies for all builds
+    set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)

    add_custom_command(
        OUTPUT ${_ggml_vk_header}
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -288,6 +288,9 @@ struct vk_device_struct {
    bool coopmat_acc_f32_support {};
    bool coopmat_acc_f16_support {};
    bool coopmat_bf16_support {};
+    bool coopmat_support_16x16x16_f16acc {};
+    bool coopmat_support_16x16x16_f32acc {};
+    bool coopmat1_fa_support {};
    uint32_t coopmat_m;
    uint32_t coopmat_n;
    uint32_t coopmat_k;
@@ -410,6 +413,13 @@ struct vk_device_struct {
    vk_pipeline pipeline_flash_attn_f32_f16_D128_cm2[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D256_cm2[GGML_TYPE_COUNT][2][2][2];

+    vk_pipeline pipeline_flash_attn_f32_f16_D64_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D80_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D96_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D112_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D128_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D256_cm1[GGML_TYPE_COUNT][2][2][2];
+
    vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2];
@@ -1588,19 +1598,36 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
    );
 }

+enum FaCodePath {
+    FA_SCALAR,
+    FA_COOPMAT1,
+    FA_COOPMAT2,
+};
+
 // number of rows/cols for flash attention shader
 static constexpr uint32_t flash_attention_num_small_rows = 32;
 static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
 static constexpr uint32_t scalar_flash_attention_num_large_rows = 8;

-static uint32_t get_fa_num_small_rows(bool scalar) {
-    return scalar ? scalar_flash_attention_num_small_rows : flash_attention_num_small_rows;
+// The FA coopmat1 shader assumes 16x16x16 matrix multiply support.
+// 128 threads split into four subgroups, each subgroup does 1/4
+// of the Bc dimension.
+static constexpr uint32_t coopmat1_flash_attention_num_large_rows = 16;
+static constexpr uint32_t scalar_flash_attention_Bc = 64;
+static constexpr uint32_t scalar_flash_attention_workgroup_size = 128;
+
+static uint32_t get_fa_num_small_rows(FaCodePath path) {
+    if (path == FA_COOPMAT2) {
+        return flash_attention_num_small_rows;
+    } else {
+        return scalar_flash_attention_num_small_rows;
+    }
 }

-static std::array<uint32_t, 2> fa_rows_cols(bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
+static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
    GGML_UNUSED(clamp);

-    if (scalar) {
+    if (path == FA_SCALAR) {
        if (small_rows) {
            return {scalar_flash_attention_num_small_rows, 64};
        } else {
@@ -1608,9 +1635,17 @@ static std::array<uint32_t, 2> fa_rows_cols(bool scalar, uint32_t D, uint32_t cl
        }
    }

+    if (path == FA_COOPMAT1) {
+        if (small_rows) {
+            return {scalar_flash_attention_num_small_rows, scalar_flash_attention_Bc};
+        } else {
+            return {coopmat1_flash_attention_num_large_rows, scalar_flash_attention_Bc};
+        }
+    }
+
    // small rows, large cols
    if (small_rows) {
-        return {get_fa_num_small_rows(scalar), 32};
+        return {get_fa_num_small_rows(FA_COOPMAT2), 32};
    }

    // small cols to reduce register count
@@ -1907,17 +1942,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
                                      parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
    };

-    auto const &fa_wg_denoms = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
-        return {fa_rows_cols(scalar, D, clamp, type, small_rows)[0], 1, 1};
+    auto const &fa_wg_denoms = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
+        return {fa_rows_cols(path, D, clamp, type, small_rows)[0], 1, 1};
    };

-    auto const &fa_spec_constants = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
+    auto const &fa_spec_constants = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
        // For large number of rows, 128 invocations seems to work best.
        // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
        // can't use 256 for D==80.
        // For scalar, use 128 (arbitrary)
-        uint32_t wg_size = scalar ? 128 : ((small_rows && (D % 32) == 0) ? 256 : 128);
-        auto rows_cols = fa_rows_cols(scalar, D, clamp, type, small_rows);
+        uint32_t wg_size = (path == FA_SCALAR || path == FA_COOPMAT1)
+                            ? scalar_flash_attention_workgroup_size
+                            : ((small_rows && (D % 32) == 0) ? 256 : 128);
+        auto rows_cols = fa_rows_cols(path, D, clamp, type, small_rows);

        // D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it.
        // D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader.
@@ -1929,36 +1966,43 @@ static void ggml_vk_load_shaders(vk_device& device) {
        return {wg_size, rows_cols[0], rows_cols[1], (D), clamp, D_split};
    };

-#define CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, D) \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true);     \
+#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, D) \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true),   1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true),   fa_rows_cols(FAPATH,D,0,TYPE,true)[1],  true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true),   1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true),   fa_rows_cols(FAPATH,D,0,TYPE,true)[1],  true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \

-#define CREATE_FA(TYPE, NAMELC, SCALAR, SUFFIX) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 64) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 80) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 96) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 112) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 128) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 256)
+#define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256)

-    CREATE_FA(GGML_TYPE_F16, f16, true, )
-    CREATE_FA(GGML_TYPE_Q4_0, q4_0, true, )
-    CREATE_FA(GGML_TYPE_Q8_0, q8_0, true, )
+    CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
+    CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
+    CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, )
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    if (device->coopmat1_fa_support) {
+        CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT1, _cm1)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT1, _cm1)
+    }
+#endif
 #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    if (device->coopmat2) {
-        CREATE_FA(GGML_TYPE_F16, f16, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q4_0, q4_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q4_1, q4_1, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q5_0, q5_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q5_1, q5_1, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q8_0, q8_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, false, _cm2)
+        CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT2, _cm2)
    }
 #endif
 #undef CREATE_FA2
@@ -1987,25 +2031,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
            CREATE_MM(pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3)
        }
 #endif
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
-        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0], matmul_q4_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1], matmul_q4_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0], matmul_q5_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1], matmul_q5_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0], matmul_q8_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K], matmul_q2_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K], matmul_q3_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K], matmul_q4_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K], matmul_q5_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K], matmul_q6_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_S],   matmul_iq1_s_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_M],   matmul_iq1_m_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS],  matmul_iq2_xs_f16,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S],   matmul_iq2_s_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S],   matmul_iq3_s_f16,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS],  matmul_iq4_xs_f16,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL],  matmul_iq4_nl_f16,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)

        CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
 #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
@@ -2041,17 +2085,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
        // Create 6 variants, {s,m,l}x{unaligned,aligned}
 #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true);   \
        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true);   \
        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true);   \
        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true);   \
        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true);   \
        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true);   \

        // Create 2 variants, {f16,f32} accumulator
 #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
@@ -2073,47 +2117,47 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #endif

        if (device->coopmat_acc_f16_support) {
-            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

-            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S],   matmul_iq1_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M],   matmul_iq1_m_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS],  matmul_iq2_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S],   matmul_iq2_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S],   matmul_iq3_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS],  matmul_iq4_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM2(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL],  matmul_iq4_nl_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
        } else {
-            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

-            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc,   matmul_iq1_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc,   matmul_iq1_m_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc,  matmul_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc,   matmul_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc,   matmul_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc,  matmul_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc,  matmul_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
        }

        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
@@ -2188,13 +2232,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
        if (device->mul_mat ## ID ## _s[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \

-#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
-        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
-        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
-        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->mul_mat ## ID ## _l[TYPE]) { \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f16acc->l, #NAMELC "_f16acc_l", NAMELC ## _f16acc_len, NAMELC ##  _f16acc_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->l, #NAMELC        "_l", NAMELC ## _len,        NAMELC ##  _data,        "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+        } \
+        if (device->mul_mat ## ID ## _m[TYPE]) { \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f16acc->m, #NAMELC "_f16acc_m", NAMELC ## _f16acc_len, NAMELC ##  _f16acc_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->m, #NAMELC        "_m", NAMELC ## _len,        NAMELC ##  _data,        "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+        } \
+        if (device->mul_mat ## ID ## _s[TYPE]) { \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f16acc->s, #NAMELC "_f16acc_s", NAMELC ## _f16acc_len, NAMELC ##  _f16acc_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->s, #NAMELC        "_s", NAMELC ## _len,        NAMELC ##  _data,        "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        } \

        // Create 2 variants, {f16,f32} accumulator
 #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
@@ -2208,34 +2258,34 @@ static void ggml_vk_load_shaders(vk_device& device) {

        CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );

-        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

-        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f16acc,   matmul_iq1_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f16acc,   matmul_iq1_m_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
-        CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ1_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S],   matmul_iq1_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ1_M,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M],   matmul_iq1_m_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS],  matmul_iq2_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S],   matmul_iq2_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S],   matmul_iq3_s_f32,   mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS],  matmul_iq4_xs_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL],  matmul_iq4_nl_f32,  mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );

 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (device->integer_dot_product) {
-            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_q8_1, _f16acc, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0], matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1], matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0], matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1], matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0], matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        }
 #endif

@@ -2284,13 +2334,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
        if (device->mul_mat ## ID ## _s[TYPE]) \
            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \

-#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC "_l", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC "_m", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC "_s", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \

        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
@@ -2322,11 +2372,11 @@ static void ggml_vk_load_shaders(vk_device& device) {

 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (device->integer_dot_product) {
-            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
-            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, , mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
+            CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, );
        }
 #endif

@@ -2754,23 +2804,29 @@ static vk_device ggml_vk_get_device(size_t idx) {
                pipeline_robustness = true;
            } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
                device->subgroup_size_control = true;
+#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
            } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
                       !getenv("GGML_VK_DISABLE_COOPMAT")) {
                device->coopmat_support = true;
                device->coopmat_m = 0;
                device->coopmat_n = 0;
                device->coopmat_k = 0;
+#endif
+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
            } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
                       !getenv("GGML_VK_DISABLE_COOPMAT2")) {
                coopmat2_support = true;
+#endif
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
            } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
                       !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
                device->integer_dot_product = true;
 #endif
+#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
            } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 &&
                       !getenv("GGML_VK_DISABLE_BFLOAT16")) {
                bfloat16_support = true;
+#endif
            }
        }

@@ -3009,6 +3065,11 @@ static vk_device ggml_vk_get_device(size_t idx) {

 #if defined(VK_KHR_cooperative_matrix)
        device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
+
+        // coopmat1 fa shader currently assumes 32 invocations per subgroup
+        device->coopmat1_fa_support = device->coopmat_support && device->subgroup_require_full_support &&
+                                      device->subgroup_size_control && device->subgroup_min_size <= 32 &&
+                                      device->subgroup_max_size >= 32;
 #endif

        if (coopmat2_support) {
@@ -3143,6 +3204,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                            // Only enable if shape is identical
                            device->coopmat_acc_f32_support = true;
                        }
+                        if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) {
+                            device->coopmat_support_16x16x16_f32acc = true;
+                        }
                    } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 &&
                               (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) {
                        // coopmat sizes not set yet
@@ -3155,6 +3219,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                            // Only enable if shape is identical
                            device->coopmat_acc_f16_support = true;
                        }
+                        if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) {
+                            device->coopmat_support_16x16x16_f16acc = true;
+                        }
                    }
                } else if ((vk::ComponentTypeKHR)prop.AType      == vk::ComponentTypeKHR::eSint8 &&
                           (vk::ComponentTypeKHR)prop.BType      == vk::ComponentTypeKHR::eSint8 &&
@@ -3656,7 +3723,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }

 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
-    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ", " << prec << ")");
    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
        return ctx->device->pipeline_matmul_f32;
    }
@@ -3684,7 +3751,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

    // MMQ
    if (src1_type == GGML_TYPE_Q8_1) {
-        vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc;
+        vk_matmul_pipeline pipelines = (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc;

        if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
            return nullptr;
@@ -3724,9 +3791,12 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte

    if (ctx->device->coopmat2) {
        assert(src1_type == GGML_TYPE_F16);
-        return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc;
+        return prec == GGML_PREC_DEFAULT ? ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f32acc;
    }
-    return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
+    if (ctx->device->coopmat_support) {
+        return (ctx->device->fp16 && ctx->device->coopmat_acc_f16_support && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
+    }
+    return (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
 }

 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols) {
@@ -4449,6 +4519,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
        return aligned ? mmp->a_m : mmp->m;
    }
    return aligned ? mmp->a_l : mmp->l;
+
+    GGML_UNUSED(src1_type);
 }

 static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) {
@@ -4604,6 +4676,19 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
        }
    }

+    if (src->type == to) {
+        // Copy two or four bytes at a time, depending on block size.
+        // For quantized types, we scale by block size/type size. But
+        // this path is also used for bf16->bf16 for example, where the
+        // type size must be exactly 2 or 4.
+        GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4);
+        if ((ggml_type_size(src->type) % 4) == 0) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        }
+    }
+
    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
    GGML_ABORT("fatal error");
 }
@@ -5688,6 +5773,36 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
    }
 }

+static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t D, bool f32acc) {
+    // Needs to be kept up to date on shader changes
+    const uint32_t wg_size = scalar_flash_attention_workgroup_size;
+    const uint32_t Br = scalar_flash_attention_num_large_rows;
+    const uint32_t Bc = scalar_flash_attention_Bc;
+
+    const uint32_t acctype = f32acc ? 4 : 2;
+    const uint32_t f16vec4 = 8;
+
+    const uint32_t tmpsh = wg_size * sizeof(float);
+    const uint32_t tmpshv4 = wg_size * 4 * acctype;
+
+    const uint32_t Qf = Br * (D / 4 + 2) * f16vec4;
+
+    const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br;
+    const uint32_t sfsh = Bc * sfshstride * acctype;
+
+    const uint32_t kshstride = D / 4 + 2;
+    const uint32_t ksh = Bc * kshstride * f16vec4;
+
+    const uint32_t slope = Br * sizeof(float);
+
+    const uint32_t total_size = tmpsh + tmpshv4 + Qf + sfsh + ksh + slope;
+    const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
+
+    VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(D=" << D << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported);
+
+    return supported;
+}
+
 static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) {
    VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
    std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
@@ -5738,7 +5853,19 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    assert(q->type == GGML_TYPE_F32);
    assert(k->type == v->type);

-    bool scalar = !ctx->device->coopmat2;
+    FaCodePath path = ctx->device->coopmat2 ? FA_COOPMAT2 :
+                      ctx->device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
+
+    if (path == FA_COOPMAT1) {
+        const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) ||
+                                             (dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc);
+
+        const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, D, dst->op_params[3] == GGML_PREC_F32);
+
+        if (!coopmat_shape_supported || !coopmat_shmem_supported) {
+            path = FA_SCALAR;
+        }
+    }

    uint32_t gqa_ratio = 1;
    uint32_t qk_ratio = neq2 / nek2;
@@ -5746,9 +5873,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    uint32_t workgroups_y = (uint32_t)neq2;
    uint32_t workgroups_z = (uint32_t)neq3;

-    // For scalar FA, we can use the "large" size to accommodate qga.
-    // For coopmat FA, we always use the small size (which is still pretty large for gqa).
-    const uint32_t max_gqa = scalar ? scalar_flash_attention_num_large_rows : get_fa_num_small_rows(false);
+    // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
+    // For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
+    uint32_t max_gqa;
+    switch (path) {
+    case FA_SCALAR:
+    case FA_COOPMAT1:
+        // We may switch from coopmat1 to scalar, so use the scalar limit for both
+        max_gqa = scalar_flash_attention_num_large_rows;
+        break;
+    case FA_COOPMAT2:
+        max_gqa = get_fa_num_small_rows(FA_COOPMAT2);
+        break;
+    default:
+        GGML_ASSERT(0);
+    }

    if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa &&
        qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
@@ -5761,11 +5900,23 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    }

    vk_pipeline *pipelines;
-    // XXX TODO other backends may be changing accumulator precision to default to f32 soon
-    bool f32acc = scalar || dst->op_params[3] == GGML_PREC_F32;
-    bool small_rows = N <= get_fa_num_small_rows(scalar);
+    bool small_rows = N <= get_fa_num_small_rows(path);

-    if (scalar) {
+    // coopmat1 does not actually support "small rows" (it needs 16 rows).
+    // So use scalar instead.
+    if (small_rows && path == FA_COOPMAT1) {
+        path = FA_SCALAR;
+    }
+
+    // scalar is faster than coopmat2 when N==1
+    if (N == 1 && path == FA_COOPMAT2) {
+        path = FA_SCALAR;
+    }
+
+    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
+
+    switch (path) {
+    case FA_SCALAR:
        switch (D) {
        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break;
        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break;
@@ -5777,7 +5928,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
            GGML_ASSERT(!"unsupported D value");
            return;
        }
-    } else {
+        break;
+    case FA_COOPMAT1:
+        switch (D) {
+        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm1[k->type][f32acc][small_rows][0]; break;
+        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm1[k->type][f32acc][small_rows][0]; break;
+        case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm1[k->type][f32acc][small_rows][0]; break;
+        case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm1[k->type][f32acc][small_rows][0]; break;
+        case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm1[k->type][f32acc][small_rows][0]; break;
+        case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm1[k->type][f32acc][small_rows][0]; break;
+        default:
+            GGML_ASSERT(!"unsupported D value");
+            return;
+        }
+        break;
+    case FA_COOPMAT2:
        switch (D) {
        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm2[k->type][f32acc][small_rows][0]; break;
        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm2[k->type][f32acc][small_rows][0]; break;
@@ -5789,6 +5954,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
            GGML_ASSERT(!"unsupported D value");
            return;
        }
+        break;
+    default:
+        GGML_ASSERT(0);
    }
    assert(pipelines);

@@ -6582,7 +6750,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_UNARY:
    case GGML_OP_CONV_2D_DW:
        {
-            const uint32_t ne = ggml_nelements(dst);
+            uint32_t ne = ggml_nelements(dst);
+            if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
+                // Convert from number of logical elements to 2- or 4-byte units.
+                ne /= ggml_blck_size(src0->type);
+                if ((ggml_type_size(src0->type) % 4) == 0) {
+                    ne *= ggml_type_size(src0->type) / 4;
+                } else {
+                    ne *= ggml_type_size(src0->type) / 2;
+                }
+            }
            if (ne > 262144) {
                elements = { 512, 512, CEIL_DIV(ne, 262144) };
            } else if (ne > 512) {
@@ -7132,8 +7309,19 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
    const uint32_t src0_type_size = ggml_type_size(src0->type);
    const uint32_t dst_type_size = ggml_type_size(dst->type);

+    uint32_t ne = (uint32_t)ggml_nelements(src0);
+    if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
+        // Convert from number of logical elements to 2- or 4-byte units.
+        ne /= ggml_blck_size(src0->type);
+        if ((ggml_type_size(src0->type) % 4) == 0) {
+            ne *= ggml_type_size(src0->type) / 4;
+        } else {
+            ne *= ggml_type_size(src0->type) / 2;
+        }
+    }
+
    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
-        (uint32_t)ggml_nelements(src0),
+        ne,
        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
        0,
@@ -9115,8 +9303,7 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
    try {
        ptr = ggml_vk_host_malloc(vk_instance.devices[0], size);
    } catch (vk::SystemError& e) {
-        std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
-        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what());
        // fallback to cpu buffer
        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
    }
@@ -9718,6 +9905,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
+
+                // We can handle copying from a type to the same type if it's
+                // contiguous (memcpy). We use f16 or f32 shaders to do the copy,
+                // so the type/block size must be a multiple of 4.
+                if (src0_type == src1_type &&
+                    ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op) &&
+                    (ggml_type_size(src0_type) % 2) == 0) {
+                    return true;
+                }
                return false;
            } break;
        case GGML_OP_REPEAT:
@@ -10123,7 +10319,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    } else if (tensor->op == GGML_OP_CONCAT) {
        tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
    } else if (tensor->op == GGML_OP_UPSCALE) {
-        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->op_params[0], tensor->op_params[1], (ggml_scale_mode) tensor->op_params[0]);
+        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
    } else if (tensor->op == GGML_OP_SCALE) {
        const float * params = (const float *)tensor->op_params;
        tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
@@ -10412,7 +10608,8 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
                        ggml_vk_print_graph_origin(tensor, done);
                        GGML_ABORT("fatal error");
                    }
-                    if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
+                    const double denom = std::fabs(correct) > 1.0f ? (std::fabs(correct) > 1e-8 ? std::fabs(correct) : 1e-8) : 1.0f;
+                    if (first_error[0] == -1 && std::fabs(correct - result) / denom > 0.5) {
                        first_error[0] = i0;
                        first_error[1] = i1;
                        first_error[2] = i2;
@@ -10424,7 +10621,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
                    // Special case, value is infinite, avoid NaN result in avg_err
                    // NaN also appears in results, if both are nan error is 0
                    if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) {
-                        avg_err += std::fabs(correct - result);
+                        avg_err += std::fabs(correct - result) / denom;
                    }
                    counter++;
                }
@@ -10459,7 +10656,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
        ggml_vk_print_graph_origin(tensor, done);
    }

-    if (avg_err > 0.05 || std::isnan(avg_err)) {
+    if (avg_err > 0.5 || std::isnan(avg_err)) {
        std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
        if (src0 != nullptr) {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -5,18 +5,35 @@ find_package (Threads REQUIRED)

 if (GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    message(STATUS "Enabling coopmat glslc support")
 endif()
 if (GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    message(STATUS "Enabling coopmat2 glslc support")
 endif()
 if (GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
+    message(STATUS "Enabling dot glslc support")
 endif()
 if (GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+    message(STATUS "Enabling bfloat16 glslc support")
 endif()
+
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
+
+# Configure output directories for MSVC builds
+if(MSVC)
+    # Get the main project's runtime output directory if possible
+    if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
+            string(TOUPPER ${CONFIG} CONFIG)
+            set_target_properties(${TARGET} PROPERTIES
+                RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+        endforeach()
+    endif()
+endif()
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp
@@ -1,6 +1,6 @@
 #version 450

-#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require

 #include "dequant_head.comp"

--- a/Show More
+++ b/Show More