mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-16 16:27:32 +03:00)

Compare commits: jared/perm ... gg/context (2 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 6107303ab0 |  |
|  | 6c0501adf7 |  |
@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

## Build Image
.github/actions/get-tag-name/action.yml (vendored, 22 changes)
@@ -1,22 +0,0 @@
name: "Determine tag name"
description: "Determine the tag name to use for a release"
outputs:
name:
description: "The name of the tag"
value: ${{ steps.tag.outputs.name }}

runs:
using: "composite"
steps:
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
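For context, this composite action exposes a single `name` output (on master it yields names like b<build-number>, otherwise <branch>-b<build-number>-<short-hash>). The release workflow removed later in this compare consumes it roughly as below — a minimal sketch, assuming a caller job that has already built into ./build/bin; the zip file name here is illustrative only:

```yaml
steps:
  - uses: actions/checkout@v4
    with:
      fetch-depth: 0   # full history: the action counts commits for the build number

  - name: Determine tag name
    id: tag
    uses: ./.github/actions/get-tag-name

  - name: Pack artifacts
    run: zip -r llama-${{ steps.tag.outputs.name }}-bin-example.zip ./build/bin/*
```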
.github/actions/windows-setup-cuda/action.yml (vendored, 67 changes)
@@ -1,67 +0,0 @@
name: "Windows - Setup CUDA Toolkit"
description: "Setup CUDA Toolkit for Windows"
inputs:
cuda_version:
description: "CUDA toolkit version"
required: true

runs:
using: "composite"
steps:
- name: Install Cuda Toolkit 11.7
if: ${{ inputs.cuda_version == '11.7' }}
shell: pwsh
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8

- name: Install Cuda Toolkit 12.4
if: ${{ inputs.cuda_version == '12.4' }}
shell: pwsh
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
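The action above is parameterized only by cuda_version; the build.yml hunks later in this compare call it as follows — a short sketch assembled from the `@@ -845,23 +1005,77 @@` hunk below, with the matrix values shown there:

```yaml
strategy:
  matrix:
    cuda: ['12.4', '11.7']

steps:
  - name: Install Cuda Toolkit
    uses: ./.github/actions/windows-setup-cuda
    with:
      cuda_version: ${{ matrix.cuda }}
```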
@@ -5,10 +5,6 @@ inputs:
description: 'CURL version'
required: false
default: '8.6.0_6'
architecture:
description: 'Architecture of the libcurl to download'
required: false
default: 'win64'
outputs:
curl_path:
description: "Path to the downloaded libcurl"
@@ -22,9 +18,8 @@ runs:
shell: powershell
env:
CURL_VERSION: ${{ inputs.curl_version }}
ARCHITECTURE: ${{ inputs.architecture }}
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
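In this hunk the `architecture` input disappears and the download URL on the other side is pinned to win64; consumers of the action only read its `curl_path` output. A minimal sketch of the call site, following the pattern used by the Windows jobs in build.yml below (step names as they appear there):

```yaml
- name: libCURL
  id: get_libcurl
  uses: ./.github/actions/windows-setup-curl

- name: Build
  env:
    CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
  run: cmake -B build -S . -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
```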
.github/workflows/build-linux-cross.yml (vendored, 91 changes)
@@ -140,94 +140,3 @@ jobs:
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

cmake --build build --config Release -j $(nproc)

ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04

steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el

# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF

sudo apt-get update || true ;# Prevent failure due to missing URLs.

sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libcurl4-openssl-dev:ppc64el

- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

cmake --build build --config Release -j $(nproc)

ubuntu-24-ppc64el-vulkan-cross:
runs-on: ubuntu-24.04

steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el

# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF

sudo apt-get update || true ;# Prevent failure due to missing URLs.

sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el \
libcurl4-openssl-dev:ppc64el

- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

cmake --build build --config Release -j $(nproc)
.github/workflows/build.yml (vendored, 724 changes)
@@ -2,19 +2,30 @@ name: CI

on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true

# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
contents: write # for creating release

env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
@@ -29,6 +40,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -61,6 +74,33 @@
cd build
ctest -L 'main|curl' --verbose --timeout 900

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip

macOS-latest-cmake-x64:
runs-on: macos-13

@@ -68,6 +108,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -101,6 +143,33 @@
cd build
ctest -L main --verbose --timeout 900

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip

ubuntu-cpu-cmake:
strategy:
matrix:
@@ -116,6 +185,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -154,6 +225,33 @@
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip

ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest

@@ -280,6 +378,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -307,7 +407,34 @@
run: |
cd build
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
ctest -L main --verbose --timeout 2700

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip

ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
@@ -704,6 +831,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -806,6 +935,35 @@
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip

ubuntu-latest-cmake-cuda:
runs-on: ubuntu-latest
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
@@ -814,6 +972,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install dependencies
env:
@@ -845,23 +1005,77 @@
strategy:
matrix:
cuda: ['12.4', '11.7']
build: ['cuda']

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-cuda-${{ matrix.cuda }}
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
variant: ccache
evict-old-files: 1d

- name: Install Cuda Toolkit
uses: ./.github/actions/windows-setup-cuda
with:
cuda_version: ${{ matrix.cuda }}
- name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8

- name: Install Cuda Toolkit 12.4
if: ${{ matrix.cuda == '12.4' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8

- name: Install Ninja
id: install_ninja
@@ -891,6 +1105,51 @@
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip

- name: Copy and pack Cuda runtime
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
run: |
echo "Cuda install location: ${{ env.CUDA_PATH }}"
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*

- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

windows-latest-cmake-sycl:
runs-on: windows-latest

@@ -899,13 +1158,15 @@
shell: bash

env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -924,6 +1185,52 @@
id: cmake_build
run: examples/sycl/win-build-sycl.bat

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Build the release package
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"

cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin

cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin

cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin

cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin

echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

- name: Upload the release package
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip

windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-latest
@@ -981,12 +1288,110 @@
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

# TODO: reuse windows-latest-cmake-hip instead of duplicating this job
windows-latest-cmake-hip-release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: windows-latest

strategy:
matrix:
gpu_target: [gfx1100, gfx1101, gfx1030]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-release
evict-old-files: 1d

- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"

- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl

- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DGGML_RPC=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip

ios-xcode-build:
runs-on: macos-latest

steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Build
id: cmake_build
@@ -1013,6 +1418,32 @@
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework

android-build:
runs-on: ubuntu-latest
@@ -1040,8 +1471,283 @@
- name: Build
run: |
cd examples/llama.android

./gradlew build --no-daemon

release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

runs-on: ubuntu-latest

needs:
- ubuntu-cpu-cmake
- ubuntu-22-cmake-vulkan
- windows-latest-cmake
- windows-2019-cmake-cuda
- windows-latest-cmake-sycl
- windows-latest-cmake-hip-release
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: release
evict-old-files: 1d

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact

- name: Move artifacts
id: move_artifacts
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release

- name: Create release
id: create_release
uses: ggml-org/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}

- name: Upload release
id: upload_release
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}
# ubuntu-latest-gcc:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Debug, Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
#
# - name: Build
# run: |
# make
#
# ubuntu-latest-clang:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Debug, Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
#
# - name: Build
# run: |
# make
#
# ubuntu-latest-gcc-sanitized:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
#
# - name: Build
# run: |
# make
#
# windows:
# runs-on: windows-latest
#
# strategy:
# matrix:
# build: [Release]
# arch: [Win32, x64]
# include:
# - arch: Win32
# s2arc: x86
# - arch: x64
# s2arc: x64
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
#
# - name: Configure
# run: >
# cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
#
# - name: Build
# run: |
# cd ./build
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Upload binaries
# uses: actions/upload-artifact@v4
# with:
# name: llama-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
#
# windows-blas:
# runs-on: windows-latest
#
# strategy:
# matrix:
# build: [Release]
# arch: [Win32, x64]
# blas: [ON]
# include:
# - arch: Win32
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
# s2arc: x86
# - arch: x64
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
# s2arc: x64
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
#
# - name: Fetch OpenBLAS
# if: matrix.blas == 'ON'
# run: |
# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
# 7z x blas.zip -oblas -y
# copy blas/include/cblas.h .
# copy blas/include/openblas_config.h .
# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
#
# - name: Configure
# run: >
# cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
#
# - name: Build
# run: |
# cd ./build
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Copy libopenblas.dll
# if: matrix.blas == 'ON'
# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
#
# - name: Upload binaries
# if: matrix.blas == 'ON'
# uses: actions/upload-artifact@v4
# with:
# name: llama-blas-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
#
# emscripten:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
# tar -xvf master.tar.gz
# emsdk-master/emsdk update
# emsdk-master/emsdk install latest
# emsdk-master/emsdk activate latest
#
# - name: Configure
# run: echo "tmp"
#
# - name: Build
# run: |
# pushd emsdk-master
# source ./emsdk_env.sh
# popd
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# make

openEuler-latest-cmake-cann:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
.github/workflows/docker.yml (vendored, 3 changes)
@@ -42,7 +42,8 @@ jobs:
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
# Note: the intel images are failing due to an out of disk space error
# - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
.github/workflows/release.yml (vendored, 716 changes)
@@ -1,716 +0,0 @@
name: Create Release

on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true

env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

jobs:
macOS-arm64:
runs-on: macos-14

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64
evict-old-files: 1d

- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl

- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name

- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip

macOS-x64:
runs-on: macos-13

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-x64
evict-old-files: 1d

- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl

- name: Build
id: cmake_build
run: |
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name

- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip

ubuntu-22-cpu:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-22.04-arm

runs-on: ${{ matrix.os }}

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake
evict-old-files: 1d

- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev

- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)

- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name

- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip

ubuntu-22-vulkan:
runs-on: ubuntu-22.04

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-vulkan
evict-old-files: 1d

- name: Dependencies
id: depends
run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev

- name: Build
id: cmake_build
run: |
cmake -B build \
-DGGML_VULKAN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)

- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name

- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip

windows:
runs-on: windows-latest
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
VULKAN_VERSION: 1.4.309.0
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
#- build: 'openblas-x64'
|
||||
# arch: 'x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
arch: 'x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'cpu-arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
|
||||
- build: 'opencl-adreno-arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.build }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
if: ${{ matrix.build == 'opencl-adreno-arm64' }}
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
cmake -B build `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build --target install
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||
cd OpenCL-ICD-Loader
|
||||
cmake -B build-arm64-release `
|
||||
-A arm64 `
|
||||
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
with:
|
||||
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-cuda:
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '11.7']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
run: |
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
runs-on: windows-latest
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-hip:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
gpu_target: [gfx1100, gfx1101, gfx1030]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-22-cpu
|
||||
- ubuntu-22-vulkan
|
||||
- windows
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
- windows-hip
|
||||
- macOS-arm64
|
||||
- macOS-x64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -252,3 +252,20 @@ configure_file(cmake/llama.pc.in

install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)

#
# copy the license files
#

# Check if running in GitHub Actions
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")

# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
endforeach()
endif()

12
README.md
@@ -16,9 +16,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

## Hot topics

- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
@@ -572,11 +571,4 @@ automatically. For example:
$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
```

## Dependencies

- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
## References

@@ -117,7 +117,6 @@ setup_framework_structure() {
# Copy all required headers (common for all platforms)
cp include/llama.h ${header_path}
cp ggml/include/ggml.h ${header_path}
cp ggml/include/ggml-opt.h ${header_path}
cp ggml/include/ggml-alloc.h ${header_path}
cp ggml/include/ggml-backend.h ${header_path}
cp ggml/include/ggml-metal.h ${header_path}

@@ -73,8 +73,6 @@ add_library(${TARGET} STATIC
minja/minja.hpp
ngram-cache.cpp
ngram-cache.h
regex-partial.cpp
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
@@ -121,8 +119,8 @@ if (LLAMA_LLGUIDANCE)

ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.7.20 (+ fix to build on GCC 15):
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
# v0.7.10:
GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
@@ -146,27 +144,3 @@ endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)


#
# copy the license files
#

# Check if running in GitHub Actions
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")

# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
add_custom_command(
POST_BUILD
TARGET ${TARGET}
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${LICENSE_FILE}"
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
endforeach()
endif()

@@ -40,7 +40,7 @@ using json = nlohmann::ordered_json;
|
||||
|
||||
std::initializer_list<enum llama_example> mmproj_examples = {
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_SERVER,
|
||||
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
|
||||
};
|
||||
|
||||
static std::string read_file(const std::string & fname) {
|
||||
@@ -2204,33 +2204,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
||||
"note: if -hf is used, this argument can be omitted",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj"},
|
||||
"explicitly disable multimodal projector, useful when using -hf",
|
||||
[](common_params & params) {
|
||||
params.no_mmproj = true;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj-offload"},
|
||||
"do not offload multimodal projector to GPU",
|
||||
[](common_params & params) {
|
||||
params.mmproj_use_gpu = false;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--image"}, "FILE",
|
||||
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
||||
@@ -2437,13 +2436,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--no-op-offload"},
|
||||
string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.no_op_offload = true;
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--lora"}, "FNAME",
|
||||
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
||||
@@ -2585,7 +2577,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, int value) {
|
||||
params.n_junk = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
|
||||
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
||||
add_opt(common_arg(
|
||||
{"--pos"}, "N",
|
||||
string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
||||
@@ -2635,20 +2627,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.i_chunk = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--parse-special"},
|
||||
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.parse_special = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"-pps"},
|
||||
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.is_pp_shared = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
||||
add_opt(common_arg(
|
||||
{"-npp"}, "n0,n1,...",
|
||||
"number of prompt tokens",
|
||||
@@ -2880,16 +2865,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.chat_template = read_file(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--no-prefill-assistant"},
|
||||
string_format(
|
||||
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
|
||||
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
|
||||
),
|
||||
[](common_params & params) {
|
||||
params.prefill_assistant = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
|
||||
add_opt(common_arg(
|
||||
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
||||
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
||||
|
||||
250
common/chat.cpp
@@ -6,15 +6,6 @@
|
||||
|
||||
#include <optional>
|
||||
|
||||
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
|
||||
auto time = std::chrono::system_clock::to_time_t(now);
|
||||
auto local_time = *std::localtime(&time);
|
||||
std::ostringstream ss;
|
||||
ss << std::put_time(&local_time, format.c_str());
|
||||
auto res = ss.str();
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef minja::chat_template common_chat_template;
|
||||
|
||||
struct common_chat_templates {
|
||||
@@ -33,7 +24,6 @@ struct templates_params {
|
||||
std::string grammar;
|
||||
bool add_generation_prompt = true;
|
||||
bool extract_reasoning = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
};
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
||||
@@ -135,9 +125,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
msgs.push_back(msg);
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
// @ngxson : disable otherwise it's bloating the API response
|
||||
// printf("%s\n", std::string("; messages = ") + messages.dump(2));
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
|
||||
}
|
||||
|
||||
return msgs;
|
||||
@@ -949,83 +937,78 @@ static void expect_tool_parameters(const std::string & name, const json & parame
|
||||
}
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_llama_3_x(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) {
|
||||
static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) {
|
||||
auto builtin_tools = json::array();
|
||||
common_chat_params data;
|
||||
if (!inputs.tools.is_null()) {
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
|
||||
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
|
||||
if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
|
||||
expect_tool_parameters(name, parameters, {"query"});
|
||||
} else if (name == "python" || name == "code_interpreter") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
|
||||
expect_tool_parameters(name, parameters, {"code"});
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<std::string> kvs;
|
||||
for (const auto & [key, value] : parameters.at("properties").items()) {
|
||||
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT
|
||||
}
|
||||
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
|
||||
builtin_tools.push_back(name);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
|
||||
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
|
||||
if (allow_python_tag_builtin_tools) {
|
||||
handle_builtin_tool(name, parameters);
|
||||
}
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"{\" space "
|
||||
"( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
|
||||
" \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
|
||||
" \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"}\" space"));
|
||||
});
|
||||
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
|
||||
});
|
||||
if (!builtin_tools.empty()) {
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
|
||||
if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
|
||||
expect_tool_parameters(name, parameters, {"query"});
|
||||
} else if (name == "python" || name == "code_interpreter") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
|
||||
expect_tool_parameters(name, parameters, {"code"});
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
// Allow a few empty lines on top of the usual constrained json schema space rule.
|
||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||
data.additional_stops.push_back("<|eom_id|>");
|
||||
|
||||
std::vector<std::string> kvs;
|
||||
for (const auto & [key, value] : parameters.at("properties").items()) {
|
||||
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT
|
||||
}
|
||||
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
|
||||
builtin_tools.push_back(name);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
|
||||
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
|
||||
if (allow_python_tag_builtin_tools) {
|
||||
handle_builtin_tool(name, parameters);
|
||||
}
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"{\" space "
|
||||
"( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
|
||||
" \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
|
||||
" \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"}\" space"));
|
||||
});
|
||||
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
|
||||
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
|
||||
: COMMON_CHAT_FORMAT_LLAMA_3_X;
|
||||
} else {
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
}
|
||||
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
|
||||
});
|
||||
if (!builtin_tools.empty()) {
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
// Allow a few empty lines on top of the usual constrained json schema space rule.
|
||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||
});
|
||||
data.additional_stops.push_back("<|eom_id|>");
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
|
||||
{"date_string", format_time(inputs.now, "%d %b %Y")},
|
||||
{"tools_in_user_message", false},
|
||||
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
|
||||
});
|
||||
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
|
||||
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
|
||||
: COMMON_CHAT_FORMAT_LLAMA_3_X;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
|
||||
@@ -1165,7 +1148,7 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
|
||||
LOG_DBG("%s\n", __func__);
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
|
||||
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
|
||||
{"datetime", "Jan 29 2025 13:00:00 GMT"},
|
||||
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
||||
});
|
||||
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
@@ -1300,59 +1283,55 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
|
||||
static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
// https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
|
||||
common_chat_params data;
|
||||
json tools = inputs.tools.is_null() ? inputs.tools : json::array();
|
||||
std::string python_code_argument_name;
|
||||
auto has_raw_python = false;
|
||||
|
||||
if (!inputs.tools.is_null()) {
|
||||
std::string python_code_argument_name;
|
||||
auto has_raw_python = false;
|
||||
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
const auto & parameters = function.at("parameters");
|
||||
std::string name = function.at("name");
|
||||
if (name == "python" || name == "ipython") {
|
||||
if (!parameters.contains("type")) {
|
||||
throw std::runtime_error("Missing type in python tool");
|
||||
}
|
||||
has_raw_python = true;
|
||||
const auto & type = parameters.at("type");
|
||||
if (type == "object") {
|
||||
auto properties = parameters.at("properties");
|
||||
for (auto it = properties.begin(); it != properties.end(); ++it) {
|
||||
if (it.value().at("type") == "string") {
|
||||
if (!python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("Multiple string arguments found in python tool");
|
||||
}
|
||||
python_code_argument_name = it.key();
|
||||
}
|
||||
}
|
||||
if (python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("No string argument found in python tool");
|
||||
}
|
||||
} else if (type != "string") {
|
||||
throw std::runtime_error("Invalid type in python tool: " + type.dump());
|
||||
}
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
const auto & parameters = function.at("parameters");
|
||||
std::string name = function.at("name");
|
||||
if (name == "python" || name == "ipython") {
|
||||
if (!parameters.contains("type")) {
|
||||
throw std::runtime_error("Missing type in python tool");
|
||||
}
|
||||
has_raw_python = true;
|
||||
const auto & type = parameters.at("type");
|
||||
if (type == "object") {
|
||||
auto properties = parameters.at("properties");
|
||||
for (auto it = properties.begin(); it != properties.end(); ++it) {
|
||||
if (it.value().at("type") == "string") {
|
||||
if (!python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("Multiple string arguments found in python tool");
|
||||
}
|
||||
python_code_argument_name = it.key();
|
||||
}
|
||||
}
|
||||
if (python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("No string argument found in python tool");
|
||||
}
|
||||
} else if (type != "string") {
|
||||
throw std::runtime_error("Invalid type in python tool: " + type.dump());
|
||||
}
|
||||
tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
|
||||
});
|
||||
if (has_raw_python) {
|
||||
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
|
||||
tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
|
||||
});
|
||||
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
|
||||
} else {
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
}
|
||||
if (has_raw_python) {
|
||||
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
|
||||
});
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
// TODO: if (has_raw_python)
|
||||
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
|
||||
@@ -1612,7 +1591,6 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
params.extract_reasoning = inputs.extract_reasoning;
|
||||
params.tool_choice = inputs.tool_choice;
|
||||
params.grammar = inputs.grammar;
|
||||
params.now = inputs.now;
|
||||
if (!inputs.json_schema.empty()) {
|
||||
params.json_schema = json::parse(inputs.json_schema);
|
||||
}
|
||||
@@ -1664,21 +1642,21 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
return common_chat_params_init_firefunction_v2(tmpl, params);
|
||||
}
|
||||
|
||||
// Plain handler (no tools)
|
||||
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return common_chat_params_init_without_tools(tmpl, params);
|
||||
}
|
||||
|
||||
// Functionary v3.1 (w/ tools)
|
||||
if (src.find("<|start_header_id|>") != std::string::npos
|
||||
&& src.find("<function=") != std::string::npos) {
|
||||
return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, params);
|
||||
}
|
||||
|
||||
// Llama 3.1, 3.2, 3.3 (also requires date_string so using it even w/o tools)
|
||||
// Llama 3.1, 3.2, 3.3 (w/ tools)
|
||||
if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
|
||||
auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
|
||||
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
|
||||
}
|
||||
|
||||
// Plain handler (no tools)
|
||||
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return common_chat_params_init_without_tools(tmpl, params);
|
||||
return common_chat_params_init_llama_3_1_tool_calls(tmpl, params, allow_python_tag_builtin_tools);
|
||||
}
|
||||
|
||||
// Mistral Nemo (w/ tools)
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -72,7 +71,6 @@ struct common_chat_templates_inputs {
|
||||
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
bool parallel_tool_calls = false;
|
||||
bool extract_reasoning = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
};
|
||||
|
||||
struct common_chat_params {
|
||||
|
||||
@@ -443,25 +443,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
|
||||
if (!str.empty() && !stop.empty()) {
|
||||
const char text_last_char = str.back();
|
||||
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
|
||||
if (stop[char_index] == text_last_char) {
|
||||
const auto current_partial = stop.substr(0, char_index + 1);
|
||||
if (string_ends_with(str, current_partial)) {
|
||||
return str.size() - char_index - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return std::string::npos;
|
||||
}
|
||||
|
||||
std::string regex_escape(const std::string & s) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
return std::regex_replace(s, special_chars, "\\$0");
|
||||
@@ -1132,7 +1113,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.offload_kqv = !params.no_kv_offload;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.op_offload = !params.no_op_offload;
|
||||
|
||||
if (params.reranking) {
|
||||
cparams.embeddings = true;
|
||||
@@ -1584,20 +1564,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
|
||||
const int64_t ne_datapoint = llama_n_ctx(ctx);
|
||||
const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
|
||||
ggml_opt_dataset_t result = ggml_opt_dataset_init(
|
||||
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
|
||||
|
||||
llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
|
||||
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
|
||||
|
||||
for (int64_t idata = 0; idata < ndata; ++idata) {
|
||||
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
|
||||
memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
@@ -333,7 +332,6 @@ struct common_params {
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
bool no_op_offload = false; // globally disable offload host tensor operations to device
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
@@ -368,7 +366,6 @@ struct common_params {
|
||||
bool use_jinja = false; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
@@ -412,7 +409,6 @@ struct common_params {
|
||||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||
|
||||
// cvector-generator params
|
||||
int n_pca_batch = 100;
|
||||
@@ -505,9 +501,10 @@ static bool string_starts_with(const std::string & str,
|
||||
return str.rfind(prefix, 0) == 0;
|
||||
}
|
||||
|
||||
// While we wait for C++20's std::string::ends_with...
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
|
||||
static bool string_ends_with(const std::string & str,
|
||||
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
@@ -667,9 +664,3 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// training utils
|
||||
//
|
||||
|
||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
||||
|
||||
@@ -189,7 +189,6 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
|
||||
/* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
|
||||
/* .use_approximate_greedy_tokenize_fn = */ false,
|
||||
/* .tokenize_user_data = */ vocab,
|
||||
/* .slices = */ nullptr,
|
||||
};
|
||||
|
||||
char error_buffer[1024];
|
||||
|
||||
@@ -13,12 +13,10 @@
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <ctime>
|
||||
#include <exception>
|
||||
#include <iomanip>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -395,8 +393,8 @@ class chat_template {
|
||||
|
||||
for (const auto & message_ : adjusted_messages) {
|
||||
auto message = message_;
|
||||
if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
|
||||
throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
|
||||
if (!message.contains("role") || !message.contains("content")) {
|
||||
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
|
||||
}
|
||||
std::string role = message.at("role");
|
||||
|
||||
@@ -417,6 +415,7 @@ class chat_template {
|
||||
}
|
||||
}
|
||||
if (polyfill_tool_calls) {
|
||||
auto content = message.at("content");
|
||||
auto tool_calls = json::array();
|
||||
for (const auto & tool_call : message.at("tool_calls")) {
|
||||
if (tool_call.at("type") != "function") {
|
||||
@@ -435,11 +434,8 @@ class chat_template {
|
||||
auto obj = json {
|
||||
{"tool_calls", tool_calls},
|
||||
};
|
||||
if (message.contains("content")) {
|
||||
auto content = message.at("content");
|
||||
if (!content.is_null() && !content.empty()) {
|
||||
obj["content"] = content;
|
||||
}
|
||||
if (!content.is_null() && !content.empty()) {
|
||||
obj["content"] = content;
|
||||
}
|
||||
message["content"] = obj.dump(2);
|
||||
message.erase("tool_calls");
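Editor's note: to make the polyfill branch above easier to follow, here is a small standalone sketch (using nlohmann::json, on which this template code is built) of how an assistant message with `tool_calls` ends up serialized into plain `content` when the template has no native tool-call support. The field layout is illustrative only, not a guaranteed output format.

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    json message = {
        {"role", "assistant"},
        {"content", nullptr},
        {"tool_calls", json::array({
            {
                {"type", "function"},
                {"function", {{"name", "get_weather"}, {"arguments", "{\"city\": \"Paris\"}"}}},
            },
        })},
    };

    // fold the calls into text content, in the spirit of the polyfill branch above
    json obj = {{"tool_calls", message.at("tool_calls")}};
    message["content"] = obj.dump(2);
    message.erase("tool_calls");

    std::cout << message.dump(2) << std::endl;
}
```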
@@ -11,7 +11,6 @@
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cmath>
|
||||
#include <exception>
|
||||
#include <functional>
|
||||
@@ -234,7 +233,7 @@ public:
|
||||
}
|
||||
} else if (is_object()) {
|
||||
if (!index.is_hashable())
|
||||
throw std::runtime_error("Unhashable type: " + index.dump());
|
||||
throw std::runtime_error("Unashable type: " + index.dump());
|
||||
auto it = object_->find(index.primitive_);
|
||||
if (it == object_->end())
|
||||
throw std::runtime_error("Key not found: " + index.dump());
|
||||
@@ -253,7 +252,7 @@ public:
|
||||
auto index = key.get<int>();
|
||||
return array_->at(index < 0 ? array_->size() + index : index);
|
||||
} else if (object_) {
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
auto it = object_->find(key.primitive_);
|
||||
if (it == object_->end()) return Value();
|
||||
return it->second;
|
||||
@@ -262,7 +261,7 @@ public:
|
||||
}
|
||||
void set(const Value& key, const Value& value) {
|
||||
if (!object_) throw std::runtime_error("Value is not an object: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
(*object_)[key.primitive_] = value;
|
||||
}
|
||||
Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
|
||||
@@ -399,7 +398,7 @@ public:
|
||||
}
|
||||
return false;
|
||||
} else if (object_) {
|
||||
if (!value.is_hashable()) throw std::runtime_error("Unhashable type: " + value.dump());
|
||||
if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump());
|
||||
return object_->find(value.primitive_) != object_->end();
|
||||
} else {
|
||||
throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
|
||||
@@ -417,7 +416,7 @@ public:
|
||||
return const_cast<Value*>(this)->at(index);
|
||||
}
|
||||
Value& at(const Value & index) {
|
||||
if (!index.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
if (is_array()) return array_->at(index.get<int>());
|
||||
if (is_object()) return object_->at(index.primitive_);
|
||||
throw std::runtime_error("Value is not an array or object: " + dump());
|
||||
@@ -677,8 +676,8 @@ public:
|
||||
class VariableExpr : public Expression {
|
||||
std::string name;
|
||||
public:
|
||||
VariableExpr(const Location & loc, const std::string& n)
|
||||
: Expression(loc), name(n) {}
|
||||
VariableExpr(const Location & location, const std::string& n)
|
||||
: Expression(location), name(n) {}
|
||||
std::string get_name() const { return name; }
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!context->contains(name)) {
|
||||
@@ -1201,9 +1200,9 @@ public:
|
||||
|
||||
class SliceExpr : public Expression {
|
||||
public:
|
||||
std::shared_ptr<Expression> start, end, step;
|
||||
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e, std::shared_ptr<Expression> && st = nullptr)
|
||||
: Expression(loc), start(std::move(s)), end(std::move(e)), step(std::move(st)) {}
|
||||
std::shared_ptr<Expression> start, end;
|
||||
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
|
||||
: Expression(loc), start(std::move(s)), end(std::move(e)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> &) const override {
|
||||
throw std::runtime_error("SliceExpr not implemented");
|
||||
}
|
||||
@@ -1220,35 +1219,18 @@ public:
|
||||
if (!index) throw std::runtime_error("SubscriptExpr.index is null");
|
||||
auto target_value = base->evaluate(context);
|
||||
if (auto slice = dynamic_cast<SliceExpr*>(index.get())) {
|
||||
auto len = target_value.size();
|
||||
auto wrap = [len](int64_t i) -> int64_t {
|
||||
if (i < 0) {
|
||||
return i + len;
|
||||
}
|
||||
return i;
|
||||
};
|
||||
int64_t step = slice->step ? slice->step->evaluate(context).get<int64_t>() : 1;
|
||||
if (!step) {
|
||||
throw std::runtime_error("slice step cannot be zero");
|
||||
}
|
||||
int64_t start = slice->start ? wrap(slice->start->evaluate(context).get<int64_t>()) : (step < 0 ? len - 1 : 0);
|
||||
int64_t end = slice->end ? wrap(slice->end->evaluate(context).get<int64_t>()) : (step < 0 ? -1 : len);
|
||||
auto start = slice->start ? slice->start->evaluate(context).get<int64_t>() : 0;
|
||||
auto end = slice->end ? slice->end->evaluate(context).get<int64_t>() : (int64_t) target_value.size();
|
||||
if (target_value.is_string()) {
|
||||
std::string s = target_value.get<std::string>();
|
||||
|
||||
std::string result;
|
||||
if (start < end && step == 1) {
|
||||
result = s.substr(start, end - start);
|
||||
} else {
|
||||
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
|
||||
result += s[i];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
if (start < 0) start = s.size() + start;
|
||||
if (end < 0) end = s.size() + end;
|
||||
return s.substr(start, end - start);
|
||||
} else if (target_value.is_array()) {
|
||||
if (start < 0) start = target_value.size() + start;
|
||||
if (end < 0) end = target_value.size() + end;
|
||||
auto result = Value::array();
|
||||
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
|
||||
for (auto i = start; i < end; ++i) {
|
||||
result.push_back(target_value.at(i));
|
||||
}
|
||||
return result;
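Editor's note: the new slice evaluation above follows Python-style semantics (negative indices wrap around, a negative step walks backwards). A compact, standalone illustration of that wrap-then-step loop, independent of the minja code:

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>

// Illustrative only: Python-like slicing over a string with explicit start/end/step.
static std::string py_slice(const std::string & s, int64_t start, int64_t end, int64_t step) {
    if (step == 0) {
        throw std::runtime_error("slice step cannot be zero");
    }
    const int64_t len = (int64_t) s.size();
    auto wrap = [len](int64_t i) { return i < 0 ? i + len : i; };
    start = wrap(start);
    end   = wrap(end);
    std::string out;
    for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
        out += s[i];
    }
    return out;
}

// py_slice("abcdef", -3, 6, 1)  == "def"
// py_slice("abcdef", 5, -7, -1) == "fedcba"   (end -7 wraps to -1, i.e. walk all the way down)
```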
@@ -1323,8 +1305,6 @@ public:
|
||||
if (name == "iterable") return l.is_iterable();
|
||||
if (name == "sequence") return l.is_array();
|
||||
if (name == "defined") return !l.is_null();
|
||||
if (name == "true") return l.to_bool();
|
||||
if (name == "false") return !l.to_bool();
|
||||
throw std::runtime_error("Unknown type for 'is' operator: " + name);
|
||||
};
|
||||
auto value = eval();
|
||||
@@ -1540,10 +1520,6 @@ public:
|
||||
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
|
||||
auto suffix = vargs.args[0].get<std::string>();
|
||||
return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
|
||||
} else if (method->get_name() == "startswith") {
|
||||
vargs.expectArgs("startswith method", {1, 1}, {0, 0});
|
||||
auto prefix = vargs.args[0].get<std::string>();
|
||||
return prefix.length() <= str.length() && std::equal(prefix.begin(), prefix.end(), str.begin());
|
||||
} else if (method->get_name() == "title") {
|
||||
vargs.expectArgs("title method", {0, 0}, {0, 0});
|
||||
auto res = str;
|
||||
@@ -2106,37 +2082,28 @@ private:
|
||||
|
||||
while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) {
|
||||
if (!consumeToken("[").empty()) {
|
||||
std::shared_ptr<Expression> index;
|
||||
auto slice_loc = get_location();
|
||||
std::shared_ptr<Expression> start, end, step;
|
||||
bool has_first_colon = false, has_second_colon = false;
|
||||
|
||||
if (!peekSymbols({ ":" })) {
|
||||
start = parseExpression();
|
||||
}
|
||||
|
||||
if (!consumeToken(":").empty()) {
|
||||
has_first_colon = true;
|
||||
if (!peekSymbols({ ":", "]" })) {
|
||||
end = parseExpression();
|
||||
}
|
||||
std::shared_ptr<Expression> index;
|
||||
if (!consumeToken(":").empty()) {
|
||||
has_second_colon = true;
|
||||
if (!peekSymbols({ "]" })) {
|
||||
step = parseExpression();
|
||||
auto slice_end = parseExpression();
|
||||
index = std::make_shared<SliceExpr>(slice_end->location, nullptr, std::move(slice_end));
|
||||
} else {
|
||||
auto slice_start = parseExpression();
|
||||
if (!consumeToken(":").empty()) {
|
||||
consumeSpaces();
|
||||
if (peekSymbols({ "]" })) {
|
||||
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), nullptr);
|
||||
} else {
|
||||
auto slice_end = parseExpression();
|
||||
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), std::move(slice_end));
|
||||
}
|
||||
} else {
|
||||
index = std::move(slice_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!index) throw std::runtime_error("Empty index in subscript");
|
||||
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
|
||||
|
||||
if ((has_first_colon || has_second_colon) && (start || end || step)) {
|
||||
index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
|
||||
} else {
|
||||
index = std::move(start);
|
||||
}
|
||||
if (!index) throw std::runtime_error("Empty index in subscript");
|
||||
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
|
||||
|
||||
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
|
||||
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
|
||||
} else if (!consumeToken(".").empty()) {
|
||||
auto identifier = parseIdentifier();
|
||||
if (!identifier) throw std::runtime_error("Expected identifier in subscript");
|
||||
|
||||
@@ -1,204 +0,0 @@
|
||||
#include "regex-partial.h"
|
||||
#include "common.h"
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
|
||||
common_regex::common_regex(const std::string & pattern) :
|
||||
pattern(pattern),
|
||||
rx(pattern),
|
||||
rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
|
||||
|
||||
common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
|
||||
std::smatch match;
|
||||
if (pos > input.size()) {
|
||||
throw std::runtime_error("Position out of bounds");
|
||||
}
|
||||
auto start = input.begin() + pos;
|
||||
auto found = as_match
|
||||
? std::regex_match(start, input.end(), match, rx)
|
||||
: std::regex_search(start, input.end(), match, rx);
|
||||
if (found) {
|
||||
common_regex_match res;
|
||||
res.type = COMMON_REGEX_MATCH_TYPE_FULL;
|
||||
for (size_t i = 0; i < match.size(); ++i) {
|
||||
auto begin = pos + match.position(i);
|
||||
res.groups.emplace_back(begin, begin + match.length(i));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
std::match_results<std::string::const_reverse_iterator> srmatch;
|
||||
if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
|
||||
auto group = srmatch[1].str();
|
||||
if (group.length() != 0) {
|
||||
auto it = srmatch[1].second.base();
|
||||
// auto position = static_cast<size_t>(std::distance(input.begin(), it));
|
||||
if ((!as_match) || it == input.begin()) {
|
||||
common_regex_match res;
|
||||
res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
|
||||
const size_t begin = std::distance(input.begin(), it);
|
||||
const size_t end = input.size();
|
||||
if (begin == std::string::npos || end == std::string::npos || begin > end) {
|
||||
throw std::runtime_error("Invalid range");
|
||||
}
|
||||
res.groups.push_back({begin, end});
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
/*
Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.

Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
to see if a string ends with a partial regex match, but it's not in std::regex yet.
Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.

- /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
- /a|b/ -> (a|b).*
- /a*?/ -> error, could match ""
- /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
- /.*?ab/ -> ((?:b)?a).* (merge .*)
- /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
- /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
- /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
- /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*

The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern
(i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
*/
std::string regex_to_reversed_partial_regex(const std::string & pattern) {
|
||||
auto it = pattern.begin();
|
||||
const auto end = pattern.end();
|
||||
|
||||
std::function<std::string()> process = [&]() {
|
||||
std::vector<std::vector<std::string>> alternatives(1);
|
||||
std::vector<std::string> * sequence = &alternatives.back();
|
||||
|
||||
while (it != end) {
|
||||
if (*it == '[') {
|
||||
auto start = it;
|
||||
++it;
|
||||
while (it != end) {
|
||||
if ((*it == '\\') && (++it != end)) {
|
||||
++it;
|
||||
} else if ((it != end) && (*it == ']')) {
|
||||
break;
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
if (it == end) {
|
||||
throw std::runtime_error("Unmatched '[' in pattern");
|
||||
}
|
||||
++it;
|
||||
sequence->push_back(std::string(start, it));
|
||||
} else if (*it == '*' || *it == '?' || *it == '+') {
|
||||
if (sequence->empty()) {
|
||||
throw std::runtime_error("Quantifier without preceding element");
|
||||
}
|
||||
sequence->back() += *it;
|
||||
auto is_star = *it == '*';
|
||||
++it;
|
||||
if (is_star) {
|
||||
if (*it == '?') {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
} else if (*it == '{') {
|
||||
if (sequence->empty()) {
|
||||
throw std::runtime_error("Repetition without preceding element");
|
||||
}
|
||||
++it;
|
||||
auto start = it;
|
||||
while (it != end && *it != '}') {
|
||||
++it;
|
||||
}
|
||||
if (it == end) {
|
||||
throw std::runtime_error("Unmatched '{' in pattern");
|
||||
}
|
||||
auto parts = string_split(std::string(start, it), ",");
|
||||
++it;
|
||||
if (parts.size() > 2) {
|
||||
throw std::runtime_error("Invalid repetition range in pattern");
|
||||
}
|
||||
|
||||
auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
|
||||
if (s.empty()) {
|
||||
return def;
|
||||
}
|
||||
return std::stoi(s);
|
||||
};
|
||||
auto min = parseOptInt(parts[0], 0);
|
||||
auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
|
||||
if (min && max && *max < *min) {
|
||||
throw std::runtime_error("Invalid repetition range in pattern");
|
||||
}
|
||||
// Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
|
||||
auto part = sequence->back();
|
||||
sequence->pop_back();
|
||||
for (int i = 0; i < *min; i++) {
|
||||
sequence->push_back(part);
|
||||
}
|
||||
if (max) {
|
||||
for (int i = *min; i < *max; i++) {
|
||||
sequence->push_back(part + "?");
|
||||
}
|
||||
} else {
|
||||
sequence->push_back(part + "*");
|
||||
}
|
||||
} else if (*it == '(') {
|
||||
++it;
|
||||
if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
|
||||
it += 2;
|
||||
}
|
||||
auto sub = process();
|
||||
if (*it != ')') {
|
||||
throw std::runtime_error("Unmatched '(' in pattern");
|
||||
}
|
||||
++it;
|
||||
auto & part = sequence->emplace_back("(?:");
|
||||
part += sub;
|
||||
part += ")";
|
||||
} else if (*it == ')') {
|
||||
break;
|
||||
} else if (*it == '|') {
|
||||
++it;
|
||||
alternatives.emplace_back();
|
||||
sequence = &alternatives.back();
|
||||
} else if (*it == '\\' && (++it != end)) {
|
||||
auto str = std::string("\\") + *it;
|
||||
sequence->push_back(str);
|
||||
++it;
|
||||
} else if (it != end) {
|
||||
sequence->push_back(std::string(1, *it));
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
// /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
|
||||
// if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
|
||||
// We'll do the outermost capturing group and final .* in the enclosing function.
|
||||
std::vector<std::string> res_alts;
|
||||
for (const auto & parts : alternatives) {
|
||||
auto & res = res_alts.emplace_back();
|
||||
for (size_t i = 0; i < parts.size() - 1; i++) {
|
||||
res += "(?:";
|
||||
}
|
||||
for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
|
||||
res += *it;
|
||||
if (it != parts.rend() - 1) {
|
||||
res += ")?";
|
||||
}
|
||||
}
|
||||
}
|
||||
return string_join(res_alts, "|");
|
||||
};
|
||||
auto res = process();
|
||||
if (it != end) {
|
||||
throw std::runtime_error("Unmatched '(' in pattern");
|
||||
}
|
||||
|
||||
return "(" + res + ")[\\s\\S]*";
|
||||
}
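Editor's note: a quick, self-contained way to see the transform above in action. The reversed-partial form of `abcd` is written out by hand here (matching the worked example in the comment block) and applied to the reverse of an input that ends in a partial match; it illustrates the idea rather than exercising the file shown in this diff.

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // hand-written equivalent of regex_to_reversed_partial_regex("abcd")
    const std::regex rx("((?:(?:(?:d)?c)?b)?a)[\\s\\S]*");

    const std::string input = "hello ab";      // ends with "ab", a prefix of "abcd"
    const std::string reversed(input.rbegin(), input.rend());

    std::smatch m;
    if (std::regex_match(reversed, m, rx)) {
        // group 1 covers "ba" in the reversed string, i.e. the partial match "ab"
        std::cout << "trailing partial match of length " << m[1].length() << std::endl;
    }
}
```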
@@ -1,56 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <regex>
|
||||
#include <string>
|
||||
|
||||
enum common_regex_match_type {
|
||||
COMMON_REGEX_MATCH_TYPE_NONE,
|
||||
COMMON_REGEX_MATCH_TYPE_PARTIAL,
|
||||
COMMON_REGEX_MATCH_TYPE_FULL,
|
||||
};
|
||||
|
||||
struct common_string_range {
|
||||
size_t begin;
|
||||
size_t end;
|
||||
common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
|
||||
if (begin > end) {
|
||||
throw std::runtime_error("Invalid range");
|
||||
}
|
||||
}
|
||||
// prevent default ctor
|
||||
common_string_range() = delete;
|
||||
bool empty() const {
|
||||
return begin == end;
|
||||
}
|
||||
bool operator==(const common_string_range & other) const {
|
||||
return begin == other.begin && end == other.end;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_regex_match {
|
||||
common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
|
||||
std::vector<common_string_range> groups;
|
||||
|
||||
bool operator==(const common_regex_match & other) const {
|
||||
return type == other.type && groups == other.groups;
|
||||
}
|
||||
bool operator!=(const common_regex_match & other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
};
|
||||
|
||||
class common_regex {
|
||||
std::string pattern;
|
||||
std::regex rx;
|
||||
std::regex rx_reversed_partial;
|
||||
|
||||
public:
|
||||
explicit common_regex(const std::string & pattern);
|
||||
|
||||
common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
|
||||
|
||||
const std::string & str() const { return pattern; }
|
||||
};
|
||||
|
||||
// For testing only (pretty print of failures).
|
||||
std::string regex_to_reversed_partial_regex(const std::string & pattern);
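Editor's note: a short usage sketch of the API declared in this header, assuming it behaves as described (a full match is preferred, otherwise a trailing partial match is reported so a streaming consumer can wait for more text). The pattern and input are hypothetical.

```cpp
#include "regex-partial.h"
#include <iostream>
#include <string>

int main() {
    common_regex re("</tool_call>");

    // streamed output that stops in the middle of the closing tag
    const std::string text = "{\"answer\": 42}</tool_";
    common_regex_match m = re.search(text, 0);

    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
        // groups[0] spans the trailing "</tool_" fragment: hold it back until
        // the match either completes or is ruled out by further text.
        std::cout << "partial match starts at " << m.groups[0].begin << std::endl;
    }
}
```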
@@ -308,7 +308,6 @@ class ModelBase:
|
||||
gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
|
||||
gguf.MODEL_TENSOR.POSNET_NORM1,
|
||||
gguf.MODEL_TENSOR.POSNET_NORM2,
|
||||
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
|
||||
)
|
||||
)
|
||||
or not new_name.endswith(".weight")
|
||||
@@ -427,11 +426,7 @@ class ModelBase:
|
||||
logger.warning(f"Failed to load model config from {dir_model}: {e}")
|
||||
logger.warning("Trying to load config.json instead")
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
if "llm_config" in config:
|
||||
# rename for InternVL
|
||||
config["text_config"] = config["llm_config"]
|
||||
return config
|
||||
return json.load(f)
|
||||
|
||||
@classmethod
|
||||
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
|
||||
@@ -799,9 +794,6 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
|
||||
# ref: https://huggingface.co/mistral-community/pixtral-12b
|
||||
res = "pixtral"
|
||||
if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
|
||||
# ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
|
||||
res = "seed-coder"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -1396,10 +1388,10 @@ class BaichuanModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
@@ -1520,10 +1512,10 @@ class XverseModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
@@ -1836,10 +1828,10 @@ class LlamaModel(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
@@ -2070,9 +2062,6 @@ class Llama4Model(LlamaModel):
|
||||
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
if name.startswith("language_model."):
|
||||
name = name.replace("language_model.", "")
|
||||
|
||||
# split the gate_up into gate and up
|
||||
if "gate_up_proj" in name:
|
||||
name_up = name.replace("gate_up_proj", "up_proj.weight")
|
||||
@@ -2093,26 +2082,6 @@ class Llama4Model(LlamaModel):
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Llama4ForConditionalGeneration")
|
||||
class Llama4VisionModel(VisionModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
|
||||
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
|
||||
assert self.hparams["hidden_act"] == "gelu"
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if "multi_modal_projector" in name or "vision_model" in name:
|
||||
# process vision tensors
|
||||
if "positional_embedding_vlm" in name and ".weight" not in name:
|
||||
name += ".weight"
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return []
|
||||
|
||||
|
||||
@ModelBase.register("Mistral3ForConditionalGeneration")
|
||||
class Mistral3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
@@ -2237,10 +2206,10 @@ class DeciModel(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
@@ -2480,10 +2449,10 @@ class MiniCPMModel(TextModel):
|
||||
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
|
||||
self.gguf_writer.add_logit_scale(logit_scale)
|
||||
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
||||
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
||||
if self.hparams.get("rope_scaling") is not None:
|
||||
if self.hparams["rope_scaling"].get("type") == "longrope":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
||||
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
@@ -2628,20 +2597,15 @@ class Qwen2Model(TextModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if self.hf_arch == "Qwen2Model":
|
||||
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
|
||||
if "language_model." in name:
|
||||
name = name.replace("language_model.", "") # for InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@@ -2745,62 +2709,6 @@ class Qwen2VLVisionModel(VisionModel):
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("InternVisionModel")
|
||||
class InternVisionModel(VisionModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
|
||||
# hidden_act
|
||||
if hparams["hidden_act"] == "silu":
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
elif hparams["hidden_act"] == "gelu":
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
else:
|
||||
raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
|
||||
# downsample_ratio
|
||||
downsample_ratio = self.global_config.get("downsample_ratio")
|
||||
assert downsample_ratio is not None
|
||||
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
if ".position_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("vision_model") or name.startswith("mlp"):
|
||||
# process visual tensors
|
||||
# correct name
|
||||
if name.startswith("vision_model"):
|
||||
name = "vision_tower." + name
|
||||
if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
# split QKV tensors if needed
|
||||
if ".qkv." in name:
|
||||
if data_torch.ndim == 2: # weight
|
||||
c3, _ = data_torch.shape
|
||||
else: # bias
|
||||
c3 = data_torch.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = data_torch[:c]
|
||||
wk = data_torch[c: c * 2]
|
||||
wv = data_torch[c * 2:]
|
||||
return [
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
|
||||
]
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("WavTokenizerDec")
|
||||
class WavTokenizerDecModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
|
||||
@@ -2855,11 +2763,11 @@ class Qwen2MoeModel(TextModel):
|
||||
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
||||
# YaRN is not enabled by default
|
||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@@ -3127,7 +3035,7 @@ class Phi3MiniModel(TextModel):
|
||||
|
||||
scale = max_pos_embds / orig_max_pos_embds
|
||||
|
||||
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
|
||||
rope_scaling_type = rope_scaling.get('type', '').lower()
|
||||
if len(rope_scaling_type) == 0:
|
||||
raise KeyError('Missing the required key rope_scaling.type')
|
||||
|
||||
@@ -3439,10 +3347,10 @@ class InternLM2Model(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
@@ -3452,11 +3360,6 @@ class InternLM2Model(TextModel):
|
||||
head_dim = n_embd // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
|
||||
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
|
||||
qkv = data_torch
|
||||
|
||||
@@ -3522,18 +3425,14 @@ class InternLM3Model(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
@@ -4967,12 +4866,12 @@ class DeepseekV2Model(TextModel):
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@@ -5464,11 +5363,11 @@ class Glm4Model(TextModel):
|
||||
super().set_gguf_parameters()
|
||||
rope_dim = self.hparams["head_dim"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
@@ -5701,10 +5600,10 @@ class ExaoneModel(TextModel):
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
||||
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
@@ -5770,20 +5669,11 @@ class GraniteModel(LlamaModel):
|
||||
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
|
||||
|
||||
|
||||
@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
|
||||
@ModelBase.register("GraniteMoeForCausalLM")
|
||||
class GraniteMoeModel(GraniteModel):
|
||||
"""Conversion for IBM's GraniteMoeForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE_MOE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
"""GraniteMoeShared uses GraniteMoe parameters plus the following:
|
||||
- shared_intermediate_size
|
||||
"""
|
||||
super().set_gguf_parameters()
|
||||
if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
|
||||
self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
|
||||
logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
"""In modeling_granitemoe, the JetMoe implementation of parallel experts
|
||||
is used. This essentially merges w1 and w3 into a single tensor with 2x
|
||||
@@ -5794,21 +5684,12 @@ class GraniteMoeModel(GraniteModel):
|
||||
if name.endswith("block_sparse_moe.input_linear.weight"):
|
||||
ffn_dim = self.hparams["intermediate_size"]
|
||||
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
|
||||
gate, up = data_torch.split(ffn_dim, dim=-2)
|
||||
gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
|
||||
]
|
||||
|
||||
if name.endswith("shared_mlp.input_linear.weight"):
|
||||
ffn_dim = self.hparams["shared_intermediate_size"]
|
||||
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
|
||||
gate, up = data_torch.split(ffn_dim, dim=-2)
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
|
||||
]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@@ -5825,11 +5706,10 @@ class BailingMoeModel(TextModel):
|
||||
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
if (self.hparams.get("rope_scaling") or {}).get("type") == "yarn" and "factor" in self.hparams["rope_scaling"]:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
else:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
|
||||
@@ -116,7 +116,6 @@ models = [
|
||||
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
|
||||
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
|
||||
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -17,25 +17,25 @@
|
||||
|
||||
**SYCL** is a high-level parallel programming model designed to improve developers' productivity when writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
|
||||
|
||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to Intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||
|
||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
### Llama.cpp + SYCL
|
||||
|
||||
The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
|
||||
SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.
|
||||
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
The following releases are verified and recommended:
|
||||
The SYCL backend would be broken by some PRs due to no online CI.
|
||||
|
||||
The following release is verified with good quality:
|
||||
|
||||
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||
|-|-|-|-|-|
|
||||
|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
|
||||
@@ -106,14 +106,15 @@ SYCL backend supports Intel GPU Family:
|
||||
|-------------------------------|---------|---------------------------------------|
|
||||
| Intel Data Center Max Series | Support | Max 1550, 1100 |
|
||||
| Intel Data Center Flex Series | Support | Flex 170 |
|
||||
| Intel Arc Series | Support | Arc 770, 730M, Arc A750, B580 |
|
||||
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
|
||||
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |
|
||||
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
|
||||
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake |
|
||||
| Intel iGPU | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
|
||||
|
||||
*Notes:*
|
||||
|
||||
- **Memory**
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||
|
||||
- **Execution Unit (EU)**
|
||||
@@ -137,11 +138,9 @@ Note: AMD GPU support is highly experimental and is incompatible with F16.
|
||||
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
|
||||
|
||||
## Docker
|
||||
|
||||
The docker build option is currently limited to *Intel GPU* targets.
|
||||
The docker build option is currently limited to *intel GPU* targets.
|
||||
|
||||
### Build image
|
||||
|
||||
```sh
|
||||
# Using FP16
|
||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
|
||||
@@ -149,10 +148,9 @@ docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f
|
||||
|
||||
*Notes*:
|
||||
|
||||
To build in default FP32 *(Slower than FP16 alternative)*, set `--build-arg="GGML_SYCL_F16=OFF"` in the previous command.
|
||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
|
||||
|
||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||
Check the [documentation for Docker](../docker.md) to see the available images.
|
||||
|
||||
### Run container
|
||||
|
||||
@@ -252,7 +250,7 @@ sycl-ls
|
||||
|
||||
- **Intel GPU**
|
||||
|
||||
When targeting an intel GPU, the user should expect one or more devices among the available SYCL devices. Please make sure that at least one GPU is present via `sycl-ls`, for instance `[level_zero:gpu]` in the sample output below:
|
||||
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
|
||||
|
||||
```
|
||||
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
@@ -284,7 +282,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
||||
|
||||
#### Intel GPU
|
||||
|
||||
```sh
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
@@ -353,7 +351,7 @@ cmake --build build --config Release -j -v
|
||||
|
||||
#### Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
|
||||
|
||||
##### Check device
|
||||
|
||||
@@ -400,15 +398,11 @@ Choose one of following methods to run.
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh 0
|
||||
# OR
|
||||
./examples/sycl/run-llama3.sh 0
|
||||
```
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh
|
||||
# OR
|
||||
./examples/sycl/run-llama3.sh
|
||||
```
|
||||
|
||||
2. Command line
|
||||
@@ -431,13 +425,13 @@ Examples:
|
||||
- Use device 0:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
@@ -458,7 +452,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
|
||||
1. Install GPU driver
|
||||
|
||||
Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||
Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||
|
||||
2. Install Visual Studio
|
||||
|
||||
@@ -635,7 +629,7 @@ Once it is completed, final results will be in **build/Release/bin**
|
||||
|
||||
#### Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
|
||||
|
||||
##### Check device
|
||||
|
||||
@@ -654,7 +648,7 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
|
||||
build\bin\llama-ls-sycl-device.exe
|
||||
```
|
||||
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPU* it would look like the following:
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
||||
```
|
||||
found 2 SYCL devices:
|
||||
| | | |Compute |Max compute|Max work|Max sub| |
|
||||
@@ -664,14 +658,13 @@ found 2 SYCL devices:
|
||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||
|
||||
```
|
||||
|
||||
#### Choose level-zero devices
|
||||
|
||||
|Chosen Device ID|Setting|
|
||||
|-|-|
|
||||
|0|Default option. You may also want to `set ONEAPI_DEVICE_SELECTOR="level_zero:0"`|
|
||||
|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|
||||
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
||||
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
|
||||
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
|
||||
|
||||
#### Execute
|
||||
|
||||
@@ -680,13 +673,7 @@ Choose one of following methods to run.
|
||||
1. Script
|
||||
|
||||
```
|
||||
examples\sycl\win-run-llama-2.bat
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
examples\sycl\win-run-llama-3.bat
|
||||
examples\sycl\win-run-llama2.bat
|
||||
```
|
||||
|
||||
2. Command line
|
||||
@@ -710,13 +697,13 @@ Examples:
|
||||
- Use device 0:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
```
|
||||
|
||||
|
||||
@@ -727,9 +714,7 @@ Note:
|
||||
```sh
|
||||
detect 1 SYCL GPUs: [0] with top Max compute units:512
|
||||
```
|
||||
|
||||
Or
|
||||
|
||||
```sh
|
||||
use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
```
|
||||
@@ -741,17 +726,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512

| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized model|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |

1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
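As a concrete illustration of the options above, a minimal Linux configure-and-build sketch might look like the following (generator, install prefix and job count are left to your setup):

```sh
# FP16 SYCL build using the oneAPI compilers listed in the table above
source /opt/intel/oneapi/setvars.sh
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_F16=ON \
    -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j
```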
#### Runtime

| Name | Value | Function |
@@ -759,7 +741,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
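For example, a debug run that also exposes free GPU memory to the scheduler could be launched roughly as follows (model path and prompt are placeholders):

```sh
GGML_SYCL_DEBUG=1 ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli \
    -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 64 -e -ngl 99 -sm layer
```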
@@ -769,7 +750,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

## Q&A

- Error: `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

- Potential cause: the oneAPI installation is unavailable or its ENV variables are not set.
- Solution: Install *oneAPI base toolkit* and enable its ENV variables through: `source /opt/intel/oneapi/setvars.sh`.

@@ -798,18 +779,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512

It's the same for other projects, including the llama.cpp SYCL backend.

- `Native API failed. Native API returns: 39 (UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)`, `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 3503030272 Bytes of memory on device`, or `failed to allocate SYCL0 buffer`
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`

You are running out of Device Memory.
Device Memory is not enough.

|Reason|Solution|
|-|-|
| The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
| The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
|Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
|Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
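For instance, if the default context does not fit, capping it on the Windows command line from the earlier examples is just a matter of adding `-c` (the model path is the same illustrative one used above):

```
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -c 8192
```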
### **GitHub contribution**:
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors check/address them without delay.
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## TODO

- Review ZES_ENABLE_SYSMAN: https://github.com/intel/compute-runtime/blob/master/programmers-guide/SYSMAN.md#support-and-limitations
- NA
@@ -22,9 +22,6 @@ Additionally, there the following images, similar to the above:

- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)

The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
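A rough sketch of running one of these images follows; it assumes the `light` image's entrypoint invokes `llama-cli` directly and that the GPU is exposed via `--device /dev/dri`, both of which depend on your host setup (paths and prompt are placeholders):

```sh
docker run --rm -v /path/to/models:/models --device /dev/dri \
    ghcr.io/ggml-org/llama.cpp:light-intel \
    -m /models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 128
```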
@@ -1,80 +0,0 @@

# Multimodal

llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools that support this feature:
- [llama-mtmd-cli](../tools/mtmd/README.md)
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API

To enable it, use one of the 2 methods below:

- Use the `-hf` option with a supported model (see the list of pre-quantized models below)
  - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
  - To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
- Use the `-m model.gguf` option with `--mmproj file.gguf` to specify the text model and the multimodal projector respectively

By default, the multimodal projector will be offloaded to the GPU. To disable this, add `--no-mmproj-offload`.

For example:

```sh
# simple usage with CLI
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF

# simple usage with server
llama-server -hf ggml-org/gemma-3-4b-it-GGUF

# using local file
llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf

# no GPU offload
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
```

## Pre-quantized models

These are ready-to-use models; most of them come with `Q4_K_M` quantization by default. They can be found on the ggml-org Hugging Face page: https://huggingface.co/ggml-org

Replace `(tool_name)` with the name of the binary you want to use, for example `llama-mtmd-cli` or `llama-server`.

NOTE: some models may require a large context window, for example: `-c 8192`
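For instance, substituting `llama-server` for `(tool_name)` in the first Gemma 3 entry of the list below gives:

```sh
llama-server -hf ggml-org/gemma-3-4b-it-GGUF
```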
```sh
|
||||
# Gemma 3
|
||||
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
|
||||
(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
|
||||
|
||||
# SmolVLM
|
||||
(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
|
||||
|
||||
# Pixtral 12B
|
||||
(tool_name) -hf ggml-org/pixtral-12b-GGUF
|
||||
|
||||
# Qwen 2 VL
|
||||
(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
|
||||
|
||||
# Qwen 2.5 VL
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
|
||||
|
||||
# Mistral Small 3.1 24B (IQ2_M quantization)
|
||||
(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
|
||||
|
||||
# InternVL 2.5 and 3
|
||||
(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
|
||||
|
||||
# Llama 4 Scout
|
||||
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
|
||||
```
|
||||
@@ -32,7 +32,6 @@ else()
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(speculative-simple)
|
||||
add_subdirectory(gen-docs)
|
||||
add_subdirectory(training)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
|
||||
@@ -35,14 +35,23 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||
// encoder-only model
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
// decoder-only model
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
|
||||
@@ -1,14 +1,3 @@

# llama.cpp/example/parallel

Simplified simulation of serving incoming requests in parallel

## Example

Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.

```bash
llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
```

> [!NOTE]
> It's recommended to use base models with this example. Instruction tuned models might not be able to properly follow the custom chat template specified here, so the results might not be as expected.
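A scaled-down smoke test with the same flags (the smaller values below are arbitrary) could be:

```bash
llama-parallel -m model.gguf -np 4 -ns 16 --top-k 1 -pps --junk 2 -c 8192
```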
@@ -34,61 +34,11 @@ static std::string k_system =
|
||||
R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
|
||||
The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
|
||||
|
||||
User:
|
||||
Recommend a nice restaurant in the area.
|
||||
Assistant:
|
||||
I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
|
||||
User:
|
||||
Who is Richard Feynman?
|
||||
Assistant:
|
||||
Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
|
||||
)";
|
||||
|
||||
static std::vector<std::string> k_questions = {
|
||||
"What is the tallest mountain in the world?",
|
||||
"Who was the first person to win two Nobel Prizes?",
|
||||
"Which country invented paper?",
|
||||
"What organ is primarily responsible for pumping blood throughout the body?",
|
||||
"Which planet is known for its prominent ring system?",
|
||||
"Who directed the movie 'Inception'?",
|
||||
"What is the freezing point of water in Fahrenheit?",
|
||||
"Which animal is known to have the longest lifespan?",
|
||||
"What language has the most native speakers worldwide?",
|
||||
"What is the capital city of Canada?",
|
||||
"Who is credited with inventing the World Wide Web?",
|
||||
"Which metal is liquid at room temperature?",
|
||||
"What is the term for an animal that eats both plants and meat?",
|
||||
"Who painted 'The Starry Night'?",
|
||||
"What gas do humans exhale that plants use for photosynthesis?",
|
||||
"What year did World War II end?",
|
||||
"Which continent has the most countries?",
|
||||
"Who wrote the novel 'Frankenstein'?",
|
||||
"What does DNA stand for?",
|
||||
"What is the main ingredient in traditional Japanese miso soup?"
|
||||
};
|
||||
|
||||
static std::vector<std::string> k_answers = {
|
||||
"The tallest mountain in the world is Mount Everest.",
|
||||
"Marie Curie was the first person to win two Nobel Prizes.",
|
||||
"Paper was invented in China.",
|
||||
"The heart is the organ responsible for pumping blood.",
|
||||
"Saturn is known for its prominent ring system.",
|
||||
"Christopher Nolan directed the movie 'Inception'.",
|
||||
"The freezing point of water in Fahrenheit is 32°F.",
|
||||
"The bowhead whale is known to have the longest lifespan among mammals.",
|
||||
"Mandarin Chinese has the most native speakers in the world.",
|
||||
"The capital city of Canada is Ottawa.",
|
||||
"Tim Berners-Lee is credited with inventing the World Wide Web.",
|
||||
"Mercury is the metal that is liquid at room temperature.",
|
||||
"An animal that eats both plants and meat is called an omnivore.",
|
||||
"'The Starry Night' was painted by Vincent van Gogh.",
|
||||
"Humans exhale carbon dioxide, which plants use in photosynthesis.",
|
||||
"World War II ended in 1945.",
|
||||
"Africa is the continent with the most countries.",
|
||||
"The novel 'Frankenstein' was written by Mary Shelley.",
|
||||
"DNA stands for Deoxyribonucleic Acid.",
|
||||
"The main ingredient in traditional Japanese miso soup is fermented soybean paste."
|
||||
};
|
||||
User: Recommend a nice restaurant in the area.
|
||||
Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
|
||||
User: Who is Richard Feynman?
|
||||
Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
|
||||
User:)";
|
||||
|
||||
static std::vector<std::string> k_prompts = {
|
||||
"What is the meaning of life?",
|
||||
@@ -99,7 +49,7 @@ static std::vector<std::string> k_prompts = {
|
||||
"What is the best way to learn a new language?",
|
||||
"How to get a job at Google?",
|
||||
"If you could have any superpower, what would it be?",
|
||||
"I want to learn how to play the piano. What would be the best way to do it?",
|
||||
"I want to learn how to play the piano.",
|
||||
};
|
||||
|
||||
struct client {
|
||||
@@ -118,7 +68,6 @@ struct client {
|
||||
int64_t t_start_prompt;
|
||||
int64_t t_start_gen;
|
||||
|
||||
int32_t n_past = 0;
|
||||
int32_t n_prompt = 0;
|
||||
int32_t n_decoded = 0;
|
||||
int32_t i_batch = -1;
|
||||
@@ -158,7 +107,6 @@ int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.n_predict = 128;
|
||||
params.n_junk = 0;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||
return 1;
|
||||
@@ -180,12 +128,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const bool dump_kv_cache = params.dump_kv_cache;
|
||||
|
||||
// is the system prompt shared in the cache
|
||||
const bool is_sp_shared = params.is_pp_shared;
|
||||
|
||||
// extra text to insert in each client's prompt in order to make it larger
|
||||
const int32_t n_junk = params.n_junk;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -227,7 +169,6 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
|
||||
tokens_system = common_tokenize(ctx, k_system, true);
|
||||
const int32_t n_tokens_system = tokens_system.size();
|
||||
|
||||
@@ -249,7 +190,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_INF("\n");
|
||||
|
||||
if (is_sp_shared) {
|
||||
{
|
||||
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
|
||||
|
||||
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
||||
@@ -287,7 +228,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
client.i_batch = batch.n_tokens;
|
||||
|
||||
common_batch_add(batch, client.sampled, client.n_past++, { client.id + 1 }, true);
|
||||
common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
|
||||
|
||||
client.n_decoded += 1;
|
||||
}
|
||||
@@ -313,23 +254,9 @@ int main(int argc, char ** argv) {
|
||||
client.t_start_gen = 0;
|
||||
|
||||
client.input = k_prompts[rand() % k_prompts.size()];
|
||||
client.prompt = client.input + "\nAssistant:";
|
||||
client.response = "";
|
||||
|
||||
// construct the prompt:
|
||||
// [system prompt] + [junk] + [user prompt]
|
||||
client.n_past = 0;
|
||||
client.prompt = "";
|
||||
if (is_sp_shared) {
|
||||
client.n_past = n_tokens_system;
|
||||
} else {
|
||||
client.prompt += k_system;
|
||||
}
|
||||
for (int i = 0; i < n_junk; ++i) {
|
||||
const int r = rand() % k_questions.size();
|
||||
client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
|
||||
}
|
||||
client.prompt += "User:\n" + client.input + "\nAssistant:\n";
|
||||
|
||||
common_sampler_reset(client.smpl);
|
||||
|
||||
// do not prepend BOS because we have a system prompt!
|
||||
@@ -337,7 +264,7 @@ int main(int argc, char ** argv) {
|
||||
tokens_prompt = common_tokenize(ctx, client.prompt, false);
|
||||
|
||||
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
||||
common_batch_add(batch, tokens_prompt[i], client.n_past++, { client.id + 1 }, false);
|
||||
common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
@@ -436,9 +363,10 @@ int main(int argc, char ** argv) {
|
||||
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
|
||||
|
||||
if (client.n_decoded > 2 &&
|
||||
(llama_vocab_is_eog(vocab, id) ||
|
||||
(params.n_predict > 0 && client.n_decoded >= params.n_predict) ||
|
||||
client.response.find("User:") != std::string::npos)) {
|
||||
(llama_vocab_is_eog(vocab, id) ||
|
||||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
|
||||
client.response.find("User:") != std::string::npos ||
|
||||
client.response.find('\n') != std::string::npos)) {
|
||||
// basic reverse prompt
|
||||
const size_t pos = client.response.find("User:");
|
||||
if (pos != std::string::npos) {
|
||||
|
||||
@@ -84,13 +84,13 @@ int main(int argc, char ** argv) {
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
// tokenize the prompt
|
||||
|
||||
// find the number of tokens in the prompt
|
||||
|
||||
@@ -12,16 +12,16 @@ source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
|
||||
NGL=99
|
||||
CONTEXT=4096
|
||||
NGL=33
|
||||
CONEXT=4096
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use single GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
|
||||
fi
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# If you want more control, DPC++ allows selecting a specific device through the
|
||||
# following environment variable
|
||||
#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
|
||||
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
|
||||
CONTEXT=4096
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "Using $GGML_SYCL_DEVICE as the main GPU"
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
|
||||
fi
|
||||
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
|
||||
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
|
||||
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
:: MIT license
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
|
||||
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-finetune)
|
||||
add_executable(${TARGET} finetune.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -1,17 +0,0 @@

# llama.cpp/examples/training

This directory contains examples related to language model training using llama.cpp/GGML.
So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
**For CPU training, compile llama.cpp without any additional backends such as CUDA.**
**For CUDA training, use the maximum number of GPU layers.**

Proof of concept:

``` sh
export model_name=llama_3.2-1b && export quantization=f32
./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
```

The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
@@ -1,96 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.escape = false;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.use_mmap) {
|
||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
|
||||
params.use_mmap = false;
|
||||
}
|
||||
if (params.cache_type_k != GGML_TYPE_F32) {
|
||||
LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
|
||||
params.cache_type_k = GGML_TYPE_F32;
|
||||
}
|
||||
if (params.cache_type_v != GGML_TYPE_F32) {
|
||||
LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
|
||||
params.cache_type_v = GGML_TYPE_F32;
|
||||
}
|
||||
|
||||
common_init();
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
llama_model_ptr & model = llama_init.model;
|
||||
llama_context_ptr & ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
constexpr float val_split = 0.05f;
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
|
||||
|
||||
struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
|
||||
optimizer_params.adamw.alpha = 1e-7f; // learning rate
|
||||
|
||||
struct llama_opt_params lopt_params {
|
||||
/*n_ctx_train =*/ 0,
|
||||
/*param_filter =*/ llama_opt_param_filter_all,
|
||||
/*param_filter_ud =*/ nullptr,
|
||||
/*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ &optimizer_params,
|
||||
};
|
||||
llama_opt_init(ctx.get(), model.get(), lopt_params);
|
||||
|
||||
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
|
||||
|
||||
ggml_opt_result_t result_train = ggml_opt_result_init();
|
||||
ggml_opt_result_t result_eval = ggml_opt_result_init();
|
||||
|
||||
for (int epoch = 0; epoch < 2; ++epoch) {
|
||||
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
ggml_opt_result_reset(result_train);
|
||||
ggml_opt_result_reset(result_eval);
|
||||
}
|
||||
ggml_opt_result_free(result_train);
|
||||
ggml_opt_result_free(result_eval);
|
||||
|
||||
llama_model_save_to_file(model.get(), "finetuned-model.gguf");
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -193,7 +193,6 @@ option(GGML_RPC "ggml: use RPC"
|
||||
option(GGML_SYCL "ggml: use SYCL" OFF)
|
||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
|
||||
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
|
||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||
"ggml: sycl target device")
|
||||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||
|
||||
@@ -248,7 +248,7 @@ extern "C" {
|
||||
// preferably to run on the same backend as the buffer
|
||||
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
|
||||
|
||||
// initialize buffers from a max size graph (optional)
|
||||
reserve_graph = build_graph(sched, max_batch_size);
|
||||
@@ -289,7 +289,7 @@ extern "C" {
|
||||
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
|
||||
// Initialize backend buffers from a measure graph
|
||||
|
||||
@@ -37,16 +37,13 @@ extern "C" {
|
||||
// ====== Dataset ======
|
||||
|
||||
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
|
||||
enum ggml_type type_data, // the type for the internal data tensor
|
||||
enum ggml_type type_label, // the type for the internal labels tensor
|
||||
int64_t ne_datapoint, // number of elements per datapoint
|
||||
int64_t ne_label, // number of elements per label
|
||||
int64_t ndata, // total number of datapoints/labels
|
||||
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
|
||||
int64_t ne_datapoint, // number of elements per datapoint
|
||||
int64_t ne_label, // number of elements per label
|
||||
int64_t ndata, // total number of datapoints/labels
|
||||
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
|
||||
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
|
||||
|
||||
// get underlying tensors that store the data
|
||||
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
|
||||
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
|
||||
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
|
||||
|
||||
@@ -59,19 +56,13 @@ extern "C" {
|
||||
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
|
||||
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
|
||||
int64_t ibatch);
|
||||
GGML_API void ggml_opt_dataset_get_batch_host(
|
||||
ggml_opt_dataset_t dataset,
|
||||
void * data_batch,
|
||||
size_t nb_data_batch,
|
||||
void * labels_batch,
|
||||
int64_t ibatch);
|
||||
|
||||
// ====== Model / Context ======
|
||||
|
||||
enum ggml_opt_build_type {
|
||||
GGML_OPT_BUILD_TYPE_FORWARD = 10,
|
||||
GGML_OPT_BUILD_TYPE_GRAD = 20,
|
||||
GGML_OPT_BUILD_TYPE_OPT = 30,
|
||||
GGML_OPT_BUILD_TYPE_FORWARD,
|
||||
GGML_OPT_BUILD_TYPE_GRAD,
|
||||
GGML_OPT_BUILD_TYPE_OPT,
|
||||
};
|
||||
|
||||
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
|
||||
@@ -90,22 +81,20 @@ extern "C" {
|
||||
// userdata can be used to pass arbitrary data
|
||||
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
|
||||
|
||||
// returns the default optimizer params (constant, hard-coded values)
|
||||
// returns the default optimizer params (constant)
|
||||
// userdata is not used
|
||||
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
|
||||
|
||||
// casts userdata to ggml_opt_optimizer_params and returns it
|
||||
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
|
||||
|
||||
// parameters for initializing a new optimization context
|
||||
struct ggml_opt_params {
|
||||
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
|
||||
|
||||
// by default the forward graph needs to be reconstructed for each eval
|
||||
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
|
||||
struct ggml_context * ctx_compute;
|
||||
struct ggml_tensor * inputs;
|
||||
struct ggml_tensor * outputs;
|
||||
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
|
||||
|
||||
// the forward graph is defined by inputs and outputs
|
||||
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
|
||||
struct ggml_tensor * inputs;
|
||||
struct ggml_tensor * outputs;
|
||||
|
||||
enum ggml_opt_loss_type loss_type;
|
||||
enum ggml_opt_build_type build_type;
|
||||
@@ -118,9 +107,12 @@ extern "C" {
|
||||
|
||||
// get parameters for an optimization context with defaults set where possible
|
||||
// parameters for which no sensible defaults exist are supplied as arguments to this function
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(
|
||||
ggml_backend_sched_t backend_sched,
|
||||
enum ggml_opt_loss_type loss_type);
|
||||
GGML_API ggml_opt_params ggml_opt_default_params(
|
||||
ggml_backend_sched_t backend_sched,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_tensor * inputs,
|
||||
struct ggml_tensor * outputs,
|
||||
enum ggml_opt_loss_type loss_type);
|
||||
|
||||
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
|
||||
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
|
||||
@@ -128,10 +120,7 @@ extern "C" {
|
||||
// set gradients to zero, initialize loss, and optionally reset the optimizer
|
||||
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
|
||||
|
||||
GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated statically
|
||||
|
||||
// get underlying tensors that store data
|
||||
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
|
||||
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
|
||||
@@ -139,12 +128,11 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
|
||||
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
|
||||
|
||||
// get the gradient accumulator for a node from the forward graph
|
||||
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
||||
|
||||
// ====== Optimization Result ======
|
||||
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init();
|
||||
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
|
||||
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
|
||||
|
||||
@@ -156,20 +144,11 @@ extern "C" {
|
||||
|
||||
// ====== Computation ======
|
||||
|
||||
// if not using static graphs, this function must be called prior to ggml_opt_alloc
|
||||
GGML_API void ggml_opt_prepare_alloc(
|
||||
ggml_opt_context_t opt_ctx,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * inputs,
|
||||
struct ggml_tensor * outputs);
|
||||
// do forward pass, increment result if not NULL
|
||||
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
|
||||
// allocate the next graph for evaluation, either forward or forward + backward
|
||||
// must be called exactly once prior to calling ggml_opt_eval
|
||||
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
|
||||
|
||||
// do forward pass, increment result if not NULL, do backward pass if allocated
|
||||
GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
// do forward pass, increment result if not NULL, do backward pass
|
||||
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
|
||||
// ############################################################################
|
||||
// ## The high-level functions start here. They do not depend on any private ##
|
||||
@@ -221,9 +200,9 @@ extern "C" {
|
||||
// fit model defined by inputs and outputs to dataset
|
||||
GGML_API void ggml_opt_fit(
|
||||
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
|
||||
struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
|
||||
struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
|
||||
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
|
||||
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
|
||||
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
|
||||
enum ggml_opt_loss_type loss_type, // loss to minimize
|
||||
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
|
||||
|
||||
@@ -768,7 +768,7 @@ extern "C" {
|
||||
// Tensor flags
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_param(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
@@ -938,7 +938,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_repeat_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// concat a and b along dim
|
||||
// used in stable-diffusion
|
||||
@@ -2049,14 +2049,15 @@ extern "C" {
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(
|
||||
struct ggml_context * ctx, // context for gradient computation
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor ** grad_accs);
|
||||
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
|
||||
struct ggml_context * ctx_compute, // context for gradient computation
|
||||
struct ggml_cgraph * cgraph,
|
||||
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
|
||||
|
||||
// graph allocation in a context
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
|
||||
@@ -674,8 +674,6 @@ struct ggml_backend_sched {
|
||||
char * context_buffer;
|
||||
size_t context_buffer_size;
|
||||
|
||||
bool op_offload;
|
||||
|
||||
int debug;
|
||||
};
|
||||
|
||||
@@ -768,7 +766,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
|
||||
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
|
||||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
@@ -1111,7 +1109,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
const int node_backend_id = tensor_backend_id(node);
|
||||
|
||||
assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
|
||||
assert(node_backend_id != -1); // all nodes should be assigned by now
|
||||
|
||||
// check if we should start a new split based on the sources of the current node
|
||||
bool need_new_split = false;
|
||||
@@ -1454,8 +1452,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
ggml_backend_buffer_type_t * bufts,
|
||||
int n_backends,
|
||||
size_t graph_size,
|
||||
bool parallel,
|
||||
bool op_offload) {
|
||||
bool parallel) {
|
||||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
@@ -1500,7 +1497,6 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
}
|
||||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
|
||||
@@ -65,7 +65,6 @@
|
||||
#include <aclnnop/aclnn_eq_tensor.h>
|
||||
#include <aclnnop/aclnn_gt_scalar.h>
|
||||
#include <aclnnop/aclnn_pow.h>
|
||||
#include <aclnnop/aclnn_grouped_matmul_v2.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <cmath>
|
||||
@@ -2588,149 +2587,3 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs expert-specific matrix multiplication (MoE) with
|
||||
* floating-point precision using the CANN backend.
|
||||
*
|
||||
* This function executes a matrix multiplication operation tailored for
|
||||
* Mixture of Experts (MoE) models, where the input tensor is multiplied
|
||||
* with expert-specific weight matrices. It uses the CANN backend for
|
||||
* efficient computation and stores the result in the destination tensor `dst`.
|
||||
* The operation may leverage identity-based optimizations or routing masks
|
||||
* as part of sparse expert selection.
|
||||
*
|
||||
* @param ctx The context for executing CANN backend operations.
|
||||
* @param dst The destination tensor where the MoE multiplication result
|
||||
* will be stored.
|
||||
*
|
||||
* @note This function assumes floating-point data types and is designed for
|
||||
* MoE architectures, possibly involving sparse expert routing.
|
||||
*/
|
||||
static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
//dst [M, K, N, 1]
|
||||
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
|
||||
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
|
||||
ggml_tensor * ids = dst->src[2]; //ids [K, N]
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// copy index from npu to cpu
|
||||
int64_t n_as = ne02; // A
|
||||
int64_t n_ids = ids->ne[0]; // K
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
|
||||
ACL_MEMCPY_DEVICE_TO_HOST);
|
||||
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
|
||||
|
||||
char * src0_original = (char *) src0->data;
|
||||
char * src1_original = (char *) src1->data;
|
||||
char * dst_original = (char *) dst->data;
|
||||
size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
|
||||
|
||||
// src0 is F16, src1 is F32, dst is F32
|
||||
ggml_cann_pool_alloc src0_cast_allocator;
|
||||
if (src0->type == GGML_TYPE_F16) {
|
||||
src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
|
||||
void* src0_cast_buf = src0_cast_allocator.get();
|
||||
|
||||
size_t cast_nb[GGML_MAX_DIMS];
|
||||
cast_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
|
||||
aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
|
||||
ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
|
||||
ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
|
||||
|
||||
src0_original = (char *) src0_cast_buf;
|
||||
memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
|
||||
}
|
||||
|
||||
std::vector<aclTensor*> src0_tensor_vec;
|
||||
std::vector<aclTensor*> src1_tensor_vec;
|
||||
std::vector<aclTensor*> dst_tensor_vec;
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
// src0_row [M, D] -> weight && permute
|
||||
int64_t src0_ne[2] = {ne01, ne00};
|
||||
size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
|
||||
// src1_row [D, 1] -> input
|
||||
int64_t src1_ne[2] = {ne10, 1};
|
||||
size_t src1_nb[2] = {nb10, nb11};
|
||||
// dst_row [M, 1] -> out
|
||||
int64_t dst_ne[2] = {ne0, 1};
|
||||
size_t dst_nb[2] = {nb0, nb1};
|
||||
|
||||
// expert index
|
||||
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||
|
||||
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
||||
int64_t i11 = (ne11 == 1 ? 0 : id);
|
||||
int64_t i12 = iid1;
|
||||
|
||||
int64_t i1 = id;
|
||||
int64_t i2 = i12;
|
||||
|
||||
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
|
||||
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
||||
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
|
||||
ACL_FLOAT, sizeof(float),
|
||||
src0_ne, src0_nb, 2);
|
||||
aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
|
||||
ACL_FLOAT, sizeof(float),
|
||||
src1_ne, src1_nb, 2);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
|
||||
ACL_FLOAT, sizeof(float),
|
||||
dst_ne, dst_nb, 2);
|
||||
|
||||
src0_tensor_vec.push_back(acl_src0);
|
||||
src1_tensor_vec.push_back(acl_src1);
|
||||
dst_tensor_vec.push_back(acl_dst);
|
||||
}
|
||||
}
|
||||
|
||||
// GroupedMatmulV2 requires tensor_list.size < 128
|
||||
size_t GROUP_SIZE = 128;
|
||||
std::vector<std::vector<aclTensor*>> src0_tensor_vec_vec;
|
||||
std::vector<std::vector<aclTensor*>> src1_tensor_vec_vec;
|
||||
std::vector<std::vector<aclTensor*>> dst_tensor_vec_vec;
|
||||
|
||||
// split and call GroupedMatmulV2
|
||||
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
|
||||
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
|
||||
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
|
||||
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
|
||||
std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
|
||||
|
||||
aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
|
||||
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
|
||||
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
|
||||
|
||||
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
const enum ggml_type type = dst->src[0]->type;
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
ggml_cann_mul_mat_id_fp(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported type for mul_mat_id");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -978,33 +978,6 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs sparse expert-based matrix multiplication using the CANN backend.
|
||||
*
|
||||
* @details This function implements a MoE-style batched matrix multiplication, where each input token
|
||||
* is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
|
||||
* in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
|
||||
*
|
||||
* For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
|
||||
* performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
|
||||
* and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
|
||||
*
|
||||
* Dimensions:
|
||||
* - src0: [D, M, A, 1], where A is the number of experts
|
||||
* - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
|
||||
* - ids : [K, N], where K is the number of experts each token is routed to
|
||||
* - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
|
||||
*
|
||||
* The function handles two main modes:
|
||||
* - If `ne12 == 1`, a simpler per-token loop is used.
|
||||
* - TODO: If `ne12 > 1`, grouped multiplication and memory copying is used for efficiency.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the expert-weighted token outputs are stored.
|
||||
* Expected to be of shape [M, K, N, 1].
|
||||
*/
|
||||
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies an element-wise operation to two input tensors using the CANN
|
||||
* backend.
|
||||
|
||||
@@ -1672,8 +1672,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
ggml_cann_mul_mat(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
ggml_cann_mul_mat_id(ctx, dst);
|
||||
break;
|
||||
return false;
|
||||
case GGML_OP_SCALE:
|
||||
ggml_cann_scale(ctx, dst);
|
||||
break;
|
||||
@@ -2031,13 +2030,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
}
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
// embedding
|
||||
case GGML_OP_GET_ROWS: {
|
||||
switch (op->src[0]->type) {
|
||||
|
||||
@@ -385,9 +385,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.6.0")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.5.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
@@ -428,7 +428,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
${KLEIDIAI_SRC}/kai/ukernels/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
|
||||
|
||||
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
|
||||
@@ -439,19 +438,17 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
|
||||
|
||||
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
|
||||
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
|
||||
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
|
||||
|
||||
if (NOT DOTPROD_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
|
||||
endif()
|
||||
|
||||
if (NOT I8MM_ENABLED MATCHES -1)
|
||||
@@ -459,13 +456,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
|
||||
if (NOT SME_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
|
||||
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
|
||||
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
||||
endif()
|
||||
|
||||
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
|
||||
|
||||
@@ -8519,11 +8519,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
#ifdef __ARM_FEATURE_MATMUL_INT8
|
||||
assert((nrc == 2) || (nrc == 1));
|
||||
#else
|
||||
assert(nrc == 1);
|
||||
#endif
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
@@ -8534,197 +8530,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q6_K * GGML_RESTRICT x0 = x;
|
||||
const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
|
||||
const block_q8_K * GGML_RESTRICT y0 = y;
|
||||
const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
|
||||
|
||||
float32x4_t vfsum = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
|
||||
const uint8_t * GGML_RESTRICT ql0 = x0->ql;
|
||||
const uint8_t * GGML_RESTRICT ql1 = x1->ql;
|
||||
const uint8_t * GGML_RESTRICT qh0 = x0->qh;
|
||||
const uint8_t * GGML_RESTRICT qh1 = x1->qh;
|
||||
const int8_t * GGML_RESTRICT qy0 = y0->qs;
|
||||
const int8_t * GGML_RESTRICT qy1 = y1->qs;
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(0x30);
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
|
||||
int32x4_t visum = vdupq_n_s32(0);
|
||||
|
||||
// process 8 blocks per iteration, totally 16 blocks
|
||||
for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) {
|
||||
int8x16_t vx0[8], vx1[8];
|
||||
|
||||
// de-quantize vx0[8]
|
||||
{
|
||||
const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0);
|
||||
const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0);
|
||||
|
||||
uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
|
||||
uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
|
||||
uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
|
||||
uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
|
||||
vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
|
||||
vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
|
||||
vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
|
||||
|
||||
q6h_0 = vandq_u8(mone, qh_bits.val[0]);
|
||||
q6h_1 = vandq_u8(mone, qh_bits.val[1]);
|
||||
q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
|
||||
q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
|
||||
vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
|
||||
vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
|
||||
vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
|
||||
}
|
||||
|
||||
// de-quantize vx1[8]
|
||||
{
|
||||
const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1);
|
||||
const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1);
|
||||
|
||||
uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
|
||||
uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
|
||||
uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
|
||||
uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
|
||||
vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
|
||||
vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
|
||||
vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
|
||||
|
||||
q6h_0 = vandq_u8(mone, qh_bits.val[0]);
|
||||
q6h_1 = vandq_u8(mone, qh_bits.val[1]);
|
||||
q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
|
||||
q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
|
||||
vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
|
||||
vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
|
||||
vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
|
||||
}
|
||||
|
||||
// process 16 elements (one block with same scale) per iteration
|
||||
// - vx = concat(ql, qh) - 32
|
||||
// - r1,r2,r3,r4 = smmla(vx, vy)
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
const int blk = j * 8 + k;
|
||||
|
||||
const int8x16_t vy0 = vld1q_s8(qy0);
|
||||
const int8x16_t vy1 = vld1q_s8(qy1);
|
||||
qy0 += 16;
|
||||
qy1 += 16;
|
||||
|
||||
const int32x4_t block_scale = {
|
||||
x0->scales[blk],
|
||||
x0->scales[blk],
|
||||
x1->scales[blk],
|
||||
x1->scales[blk],
|
||||
};
|
||||
|
||||
// calculate four results at once with outer product
|
||||
const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
|
||||
const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
|
||||
const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
|
||||
const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
|
||||
int32x4_t vr = vdupq_n_s32(0);
|
||||
vr = vmmlaq_s32(vr, vx_l, vy_l);
|
||||
vr = vmmlaq_s32(vr, vx_h, vy_h);
|
||||
|
||||
// apply block scale, will NOT overflow
|
||||
// block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits
|
||||
visum = vmlaq_s32(visum, vr, block_scale);
|
||||
}
|
||||
}
|
||||
|
||||
// adjust bias, apply superblock scale
|
||||
{
|
||||
int32_t bias[4];
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
|
||||
const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
|
||||
const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
|
||||
const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
|
||||
const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
|
||||
const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
|
||||
const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
|
||||
const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
|
||||
const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
|
||||
const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
|
||||
const svint64_t zero = svdup_n_s64(0);
|
||||
bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
|
||||
svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
|
||||
bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
|
||||
svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
|
||||
bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
|
||||
svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
|
||||
bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
|
||||
svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
|
||||
#else
|
||||
// NEON doesn't support int16 dot product, fallback to separated mul and add
|
||||
const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
|
||||
const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
|
||||
|
||||
int8x16_t scales_s8 = vld1q_s8(x0->scales);
|
||||
const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
|
||||
scales_s8 = vld1q_s8(x1->scales);
|
||||
const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
|
||||
|
||||
int32x4_t prod;
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1]))));
|
||||
bias[0] = vaddvq_s32(prod);
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1]))));
|
||||
bias[1] = vaddvq_s32(prod);
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1]))));
|
||||
bias[2] = vaddvq_s32(prod);
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
|
||||
bias[3] = vaddvq_s32(prod);
|
||||
|
||||
#endif
|
||||
const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
|
||||
|
||||
const float32x4_t superblock_scale = {
|
||||
GGML_FP16_TO_FP32(x0->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x0->d) * y1->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y1->d,
|
||||
};
|
||||
|
||||
visum = vsubq_s32(visum, vibias);
|
||||
vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
|
||||
}
|
||||
}
|
||||
|
||||
// vfsum = ABCD -> ACBD
|
||||
// AC -> s, BD -> (s+bs)
|
||||
vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
|
||||
vst1_f32(s, vget_low_f32 (vfsum));
|
||||
vst1_f32(s + bs, vget_high_f32(vfsum));
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
float sum = 0;
|
||||
|
||||
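A note on the "adjust bias, apply superblock scale" step in the hunk above: the SMMLA path accumulates the raw 6-bit values (0..63) and only removes the implicit -32 offset at the end, as 32 * sum_b(scale_b * bsum_b). A minimal standalone C++ sketch of that identity (arbitrary test values, not part of the patch and not the repo's data layout):

// sum_b s_b * sum_l (q6 - 32) * q8
//   = sum_b s_b * sum_l q6 * q8  -  32 * sum_b s_b * bsum_b
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int>    q6(256), q8(256);   // one superblock: 16 blocks of 16 values
    std::vector<int8_t> scales(16);         // one int8 scale per block
    std::vector<long>   bsums(16, 0);       // per-block sums of the q8 values
    for (int i = 0; i < 256; ++i) { q6[i] = (i*37) % 64; q8[i] = (i*53) % 256 - 128; }
    for (int b = 0; b < 16;  ++b) { scales[b] = (int8_t)(b*7 - 50); }
    for (int i = 0; i < 256; ++i) { bsums[i/16] += q8[i]; }

    long direct = 0, folded = 0, bias = 0;
    for (int b = 0; b < 16; ++b) {
        long blk = 0;
        for (int l = 0; l < 16; ++l) {
            const int i = b*16 + l;
            direct += (long) scales[b] * (q6[i] - 32) * q8[i]; // subtract 32 per element
            blk    += (long) q6[i] * q8[i];                    // keep the raw 6-bit value
        }
        folded += (long) scales[b] * blk;
        bias   += (long) scales[b] * bsums[b];
    }
    assert(direct == folded - 32*bias); // the vibias adjustment applied above
    std::printf("direct=%ld folded-32*bias=%ld\n", direct, folded - 32*bias);
    return 0;
}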
@@ -282,11 +282,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.from_float = quantize_row_q6_K,
|
||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
.nrows = 2,
|
||||
#else
|
||||
.nrows = 1,
|
||||
#endif
|
||||
},
|
||||
[GGML_TYPE_IQ2_XXS] = {
|
||||
.from_float = NULL,
|
||||
|
||||
@@ -4,22 +4,16 @@
|
||||
|
||||
// KleidiAI micro-kernels
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
|
||||
#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
|
||||
|
||||
#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
|
||||
|
||||
#include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
|
||||
|
||||
#include "kai_common.h"
|
||||
|
||||
#include "kernels.h"
|
||||
@@ -67,53 +61,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
{
|
||||
/* SME GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .run_kernel = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
},
|
||||
/* SME GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .run_kernel = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
},
|
||||
/* .lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .packed_size = */ kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
/* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_F16,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__APPLE__)
|
||||
@@ -158,9 +105,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
@@ -204,9 +148,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#else
|
||||
@@ -251,9 +192,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
@@ -297,33 +235,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#endif
|
||||
};

ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
    ggml_kleidiai_kernels * kernel = nullptr;

    if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
            if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
                gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
                gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
                gemm_gemv_kernels[i].op_type == tensor->type) {
                kernel = &gemm_gemv_kernels[i];
                break;
            }
        }
    }

    return kernel;
}

ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature features) {
    ggml_kleidiai_kernels * kernels = nullptr;

    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
|
||||
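The selection loop above keeps a kernel only when every required CPU-feature bit is set, i.e. (cpu_features & required_cpu) == required_cpu, and the tensor types match. A tiny sketch of that feature predicate (the bit values here are assumed for illustration, not taken from the header):

#include <cstdio>

enum cpu_feature : unsigned {
    CPU_FEATURE_NONE    = 0,
    CPU_FEATURE_DOTPROD = 1 << 0,
    CPU_FEATURE_I8MM    = 1 << 1,
    CPU_FEATURE_SVE     = 1 << 2,
    CPU_FEATURE_SME     = 1 << 3,
};

static bool supports(unsigned cpu_features, unsigned required) {
    return (cpu_features & required) == required; // all required bits present
}

int main() {
    const unsigned cpu = CPU_FEATURE_DOTPROD | CPU_FEATURE_SVE;
    std::printf("%d\n", supports(cpu, CPU_FEATURE_DOTPROD));                    // 1
    std::printf("%d\n", supports(cpu, CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM)); // 0: I8MM missing
    return 0;
}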
@@ -4,10 +4,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <variant>
|
||||
#include "ggml.h"
|
||||
|
||||
enum cpu_feature {
|
||||
CPU_FEATURE_NONE = 0,
|
||||
CPU_FEATURE_DOTPROD = 1,
|
||||
@@ -30,53 +26,26 @@ struct kernel_info {
|
||||
size_t (*get_nr)(void);
|
||||
size_t (*get_kr)(void);
|
||||
size_t (*get_sr)(void);
|
||||
std::variant<
|
||||
std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
|
||||
std::function<size_t(size_t m_idx, size_t k)>
|
||||
> get_lhs_offset;
|
||||
std::variant<
|
||||
std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
|
||||
std::function<size_t(size_t n_idx, size_t k)>
|
||||
> get_rhs_packed_offset;
|
||||
size_t (*get_lhs_offset)(size_t m_idx, size_t k, size_t bl);
|
||||
size_t (*get_rhs_packed_offset)(size_t n_idx, size_t k, size_t bl);
|
||||
size_t (*get_dst_offset)(size_t m_idx, size_t n_idx, size_t stride);
|
||||
size_t (*get_dst_size)(size_t m, size_t n);
|
||||
std::variant<
|
||||
std::function<void(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
|
||||
float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max)>,
|
||||
std::function<void(size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row,
|
||||
size_t dst_stride_col, float clamp_min, float clamp_max)>
|
||||
> run_kernel;
|
||||
void (*run_kernel)(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
|
||||
float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max);
|
||||
};
|
||||
|
||||
struct lhs_packing_info {
|
||||
size_t (*get_offset)(size_t m_idx, size_t lhs_stride);
|
||||
std::variant<
|
||||
std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
|
||||
std::function<size_t(size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr)>
|
||||
> get_packed_offset;
|
||||
std::variant<
|
||||
std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
|
||||
std::function<size_t(size_t m, size_t k, size_t mr, size_t kr, size_t sr)>
|
||||
> packed_size;
|
||||
std::variant<
|
||||
std::function<void(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
|
||||
size_t lhs_stride, void* lhs_packed)>,
|
||||
std::function<void(size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* lhs, size_t lhs_stride,
|
||||
void* lhs_packed)>
|
||||
> pack_func;
|
||||
size_t (*get_packed_offset)(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
|
||||
size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
|
||||
void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
|
||||
size_t lhs_stride, void* lhs_packed);
|
||||
};
|
||||
|
||||
struct rhs_packing_info {
|
||||
std::variant<
|
||||
std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
|
||||
std::function<size_t(size_t n, size_t k)>
|
||||
> packed_size;
|
||||
std::variant<
|
||||
std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
|
||||
const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
|
||||
std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
|
||||
const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
|
||||
> pack_func;
|
||||
size_t (*packed_size)(size_t n, size_t k, size_t nr, size_t kr, size_t bl);
|
||||
void (*pack_func)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
|
||||
const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params);
|
||||
};
|
||||
|
||||
struct ggml_kleidiai_kernels {
|
||||
@@ -86,10 +55,6 @@ struct ggml_kleidiai_kernels {
|
||||
rhs_packing_info rhs_info;
|
||||
|
||||
cpu_feature required_cpu;
|
||||
ggml_type lhs_type;
|
||||
ggml_type rhs_type;
|
||||
ggml_type op_type;
|
||||
};
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features);
|
||||
|
||||
@@ -3,9 +3,7 @@
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
#include <atomic>
|
||||
#include <cfloat>
|
||||
#include <stdexcept>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#if defined(__linux__)
|
||||
@@ -36,9 +34,8 @@
|
||||
#include "ggml-common.h"
|
||||
|
||||
struct ggml_kleidiai_context {
|
||||
cpu_feature features;
|
||||
ggml_kleidiai_kernels * kernels;
|
||||
} static ctx = { CPU_FEATURE_NONE, NULL };
|
||||
} static ctx = { NULL };
|
||||
|
||||
static void init_kleidiai_context(void) {
|
||||
|
||||
@@ -50,18 +47,18 @@ static void init_kleidiai_context(void) {
|
||||
const char *env_var = getenv("GGML_KLEIDIAI_SME");
|
||||
int sme_enabled = 0;
|
||||
|
||||
ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
|
||||
cpu_feature features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
|
||||
|
||||
if (env_var) {
|
||||
sme_enabled = atoi(env_var);
|
||||
}
|
||||
|
||||
if (sme_enabled != 0) {
|
||||
ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
||||
features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
||||
}
|
||||
ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
|
||||
ctx.kernels = ggml_kleidiai_select_kernels(features);
|
||||
}
|
||||
ggml_critical_section_end();
|
||||
}
|
||||
@@ -71,275 +68,95 @@ static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
|
||||
return tensor->ne[dim];
|
||||
}
|
||||
|
||||
template<typename Ret, typename Variant, typename... Args>
static Ret variant_call(const Variant & var, Args&&... args) {
    return std::visit([&](auto&& func) -> Ret {
        if constexpr (std::is_invocable_r_v<Ret, decltype(func), Args...>) {
            return func(std::forward<Args>(args)...);
        } else {
            throw std::runtime_error("Invalid function type in variant_call");
        }
    }, var);
}
|
||||
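The variant_call helper above dispatches through std::visit and checks invocability at compile time, so one struct member can hold either the blocked (Q4_0) or the non-blocked (F16) signature of the same KleidiAI entry point. A reduced, self-contained sketch of the idea (names and values here are illustrative, not the repo's API):

#include <cstdio>
#include <functional>
#include <stdexcept>
#include <type_traits>
#include <variant>

// two alternative signatures for "packed size", mirroring the packing-info structs above
using packed_size_fn = std::variant<
    std::function<size_t(size_t m, size_t k, size_t bl)>,  // blocked variant (e.g. Q4_0)
    std::function<size_t(size_t m, size_t k)>>;            // non-blocked variant (e.g. F16)

template <typename Ret, typename Variant, typename... Args>
static Ret variant_call(const Variant & var, Args &&... args) {
    return std::visit([&](auto && fn) -> Ret {
        if constexpr (std::is_invocable_r_v<Ret, decltype(fn), Args...>) {
            return fn(std::forward<Args>(args)...);
        } else {
            throw std::runtime_error("argument list does not match the stored callable");
        }
    }, var);
}

int main() {
    packed_size_fn blocked   = [](size_t m, size_t k, size_t bl) { return m * k / bl; };
    packed_size_fn unblocked = [](size_t m, size_t k)            { return m * k;      };
    std::printf("%zu\n", variant_call<size_t>(blocked,   size_t(64), size_t(256), size_t(32))); // 512
    std::printf("%zu\n", variant_call<size_t>(unblocked, size_t(64), size_t(256)));             // 16384
    return 0;
}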
namespace ggml::cpu::kleidiai {
|
||||
|
||||
static size_t round_down(size_t x, size_t y) {
|
||||
return y == 0 ? x : x - (x % y);
|
||||
}
|
||||
|
||||
static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint16_t * src, size_t rhs_stride) {
|
||||
size_t src_stride = rhs_stride / sizeof(uint16_t);
|
||||
size_t dst_stride = n;
|
||||
|
||||
for (size_t k_idx = 0; k_idx < k; ++k_idx) {
|
||||
for (size_t n_idx = 0; n_idx < n; ++n_idx) {
|
||||
uint16_t v = *(src + k_idx + n_idx * src_stride);
|
||||
*(dst + n_idx + k_idx * dst_stride) = kai_cast_f32_f16(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
|
||||
GGML_ASSERT(kernels);
|
||||
kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
|
||||
GGML_ASSERT(ctx.kernels);
|
||||
kernel_info * kernel = op->src[1]->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
|
||||
|
||||
size_t k = op->src[0]->ne[0];
|
||||
size_t n = op->src[0]->ne[1];
|
||||
size_t m = op->src[1]->ne[1];
|
||||
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
if (kernels->rhs_type == GGML_TYPE_Q4_0) {
|
||||
size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, QK4_0, mr, kr, sr);
|
||||
} else if (kernels->rhs_type == GGML_TYPE_F16) {
|
||||
size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr) +
|
||||
variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
|
||||
k * n * sizeof(float) + n * sizeof(float);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
size = ctx.kernels->lhs_info.packed_size(m, k, QK4_0, mr, kr, sr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
|
||||
if (dst->op == GGML_OP_MUL_MAT) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
return compute_forward_q4_0(params, dst);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
return compute_forward_kv_cache(params, dst);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
GGML_ASSERT(ctx.kernels);
|
||||
kernel_info * kernel = src1->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
|
||||
lhs_packing_info * lhs_info = &ctx.kernels->lhs_info;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_ASSERT(kernel);
|
||||
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
GGML_ASSERT(kernels);
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
|
||||
GGML_ASSERT(kernel);
|
||||
const size_t k = ne00;
|
||||
const size_t m = ne11;
|
||||
const size_t n = ne01;
|
||||
|
||||
const int nth = params->nth;
|
||||
const int ith = params->ith;
|
||||
const size_t n_step = kernel->get_n_step();
|
||||
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||
const size_t n_start = ith * num_n_per_thread;
|
||||
|
||||
const int64_t lhs_batch_size0 = ne12;
|
||||
const int64_t rhs_batch_size0 = ne02;
|
||||
const int64_t batch_size = rhs_batch_size0;
|
||||
|
||||
const int64_t r = lhs_batch_size0 / rhs_batch_size0;
|
||||
|
||||
const int64_t m = ne11 * r;
|
||||
const int64_t n = ne01;
|
||||
const int64_t k = ne00;
|
||||
|
||||
const size_t lhs_stride = src1->nb[1];
|
||||
const size_t rhs_stride = src0->nb[1];
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
|
||||
const int64_t mr = static_cast<int64_t>(kernel->get_mr());
|
||||
const int64_t nr = static_cast<int64_t>(kernel->get_nr());
|
||||
const int64_t kr = static_cast<int64_t>(kernel->get_kr());
|
||||
const int64_t sr = static_cast<int64_t>(kernel->get_sr());
|
||||
|
||||
const size_t lhs_packed_size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr);
|
||||
const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k);
|
||||
const size_t kxn_size = k * n * sizeof(float);
|
||||
const size_t bias_size = n * sizeof(float);
|
||||
|
||||
const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
|
||||
GGML_ASSERT(wsize_required <= params->wsize);
|
||||
|
||||
uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
|
||||
uint8_t * rhs_packed = lhs_packed + lhs_packed_size;
|
||||
uint8_t * rhs_kxn = rhs_packed + rhs_packed_size;
|
||||
uint8_t * bias = rhs_kxn + kxn_size;
|
||||
|
||||
for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
|
||||
const uint8_t * lhs_batch = static_cast<const uint8_t *>(src1->data) + batch_idx * m * lhs_stride;
|
||||
const uint8_t * rhs_batch = static_cast<const uint8_t *>(src0->data) + batch_idx * n * rhs_stride;
|
||||
uint8_t * dst_batch = static_cast<uint8_t *>(dst->data) + batch_idx * m * dst_stride;
|
||||
|
||||
// LHS packing
|
||||
{
|
||||
const int64_t m_roundup_mr = kai_roundup(m, mr);
|
||||
const int64_t num_threads = KAI_MIN(m_roundup_mr / mr, nth);
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_m_per_thread0 = round_down(m_roundup_mr / num_threads, mr);
|
||||
const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;
|
||||
|
||||
const int64_t m_start = ith * num_m_per_thread0;
|
||||
const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
|
||||
|
||||
const size_t lhs_offset = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride);
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(kernels->lhs_info.get_packed_offset, m_start, k, mr, kr, sr);
|
||||
|
||||
const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset;
|
||||
void * dst_ptr = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset;
|
||||
|
||||
variant_call<void>(kernels->lhs_info.pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
|
||||
}
|
||||
size_t n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
}
|
||||
|
||||
// RHS packing
|
||||
if (first_to_arrive.test_and_set(std::memory_order_acquire) == false) {
|
||||
// First thread to reach this point handles RHS packing
|
||||
memset(bias, 0, n * sizeof(float));
|
||||
transpose_f32kxn_f16nxk(n, k, reinterpret_cast<float *>(rhs_kxn),
|
||||
reinterpret_cast<const uint16_t *>(rhs_batch), rhs_stride);
|
||||
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
|
||||
uint8_t * lhs_packed = (uint8_t*)params->wdata;
|
||||
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
|
||||
|
||||
variant_call<void>(kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, n * sizeof(float),
|
||||
rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr);
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
// Calculate number of columns to be processed per thread
|
||||
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||
const size_t m_start = ith * num_m_per_thread;
|
||||
size_t m_to_process = num_m_per_thread;
|
||||
if ((m_start + m_to_process) > m) {
|
||||
m_to_process = m - m_start;
|
||||
}
|
||||
|
||||
if(m_start < m) {
|
||||
// Transform LHS
|
||||
const size_t src_stride = src1->nb[1];
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
|
||||
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||
|
||||
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
first_to_arrive.clear(std::memory_order_release);
|
||||
// Perform the operation
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset(0, k, QK4_0, mr, kr, sr);
|
||||
const size_t rhs_packed_offset = kernel->get_rhs_packed_offset(n_start, k, QK4_0);
|
||||
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
|
||||
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
|
||||
const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
|
||||
float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||
|
||||
// Perform the matmul
|
||||
{
|
||||
const int64_t m_to_process = m;
|
||||
const int64_t m_start = 0;
|
||||
|
||||
const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
|
||||
const int64_t num_threads = KAI_MIN(n / n_step, nth);
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
|
||||
const int64_t num_n_per_threadN_1 = n - (num_threads - 1) * num_n_per_thread0;
|
||||
|
||||
const int64_t n_start = ith * num_n_per_thread0;
|
||||
const int64_t n_to_process = (ith == num_threads - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
|
||||
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(kernel->get_lhs_offset, m_start, k);
|
||||
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k);
|
||||
const size_t dst_offset = kernel->get_dst_offset(m_start, n_start, dst_stride);
|
||||
|
||||
const void * lhs_ptr = lhs_packed + lhs_packed_offset;
|
||||
const void * rhs_ptr = rhs_packed + rhs_packed_offset;
|
||||
float * dst_ptr = reinterpret_cast<float *>(dst_batch + dst_offset);
|
||||
|
||||
variant_call<void>(kernel->run_kernel, m_to_process, n_to_process, k, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
if (batch_idx != batch_size - 1) {
|
||||
// This barrier is necessary when the batch size is larger than 1. While processing a batch,
|
||||
// the work data buffer (params->wdata) is used as temporary storage which means that only
|
||||
// a single batch can be processed at any given time. No barrier is needed for the last
|
||||
// batch since GGML inserts a barrier between the execution of every operator.
|
||||
ggml_barrier(params->threadpool);
|
||||
}
|
||||
kernel->run_kernel(m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr,
|
||||
dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
return true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
GGML_ASSERT(kernels);
|
||||
|
||||
kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
|
||||
lhs_packing_info * lhs_info = &kernels->lhs_info;
|
||||
|
||||
GGML_ASSERT(kernel);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const size_t k = ne00;
|
||||
const size_t m = ne11;
|
||||
const size_t n = ne01;
|
||||
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
|
||||
uint8_t * lhs_packed = (uint8_t*)params->wdata;
|
||||
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
|
||||
|
||||
const size_t n_step = kernel->get_n_step();
|
||||
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||
const size_t n_start = ith * num_n_per_thread;
|
||||
|
||||
size_t n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
}
|
||||
|
||||
// Calculate number of columns to be processed per thread
|
||||
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||
const size_t m_start = ith * num_m_per_thread;
|
||||
size_t m_to_process = num_m_per_thread;
|
||||
if ((m_start + m_to_process) > m) {
|
||||
m_to_process = m - m_start;
|
||||
}
|
||||
|
||||
if (m_start < m) {
|
||||
// Transform LHS
|
||||
const size_t src_stride = src1->nb[1];
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, QK4_0, mr, kr, sr);
|
||||
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||
|
||||
variant_call<void>(lhs_info->pack_func, m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
// Perform the operation
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, 0, k, QK4_0, mr, kr, sr);
|
||||
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k, QK4_0);
|
||||
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
|
||||
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
|
||||
const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
|
||||
float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||
|
||||
variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
|
||||
sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public:
|
||||
@@ -352,13 +169,13 @@ public:
|
||||
size_t sr = ctx.kernels->gemm.get_sr();
|
||||
|
||||
#ifndef NDEBUG
|
||||
const size_t repacked_size = variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
|
||||
const size_t repacked_size = ctx.kernels->rhs_info.packed_size(n, k, nr, kr, QK4_0);
|
||||
GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
|
||||
#endif
|
||||
struct kai_rhs_pack_qs4cxs1s0_param params;
|
||||
params.lhs_zero_point = 1;
|
||||
params.rhs_zero_point = 8;
|
||||
variant_call<void>(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, ¶ms);
|
||||
ctx.kernels->rhs_info.pack_func(1, n, k, nr, kr, sr, QK4_0, (const uint8_t *)data, NULL, tensor->data, 0, ¶ms);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -372,7 +189,7 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
|
||||
}
|
||||
} // namespace ggml::cpu::kleidiai
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
GGML_API enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
@@ -421,11 +238,12 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
|
||||
namespace ggml::cpu::kleidiai {
|
||||
class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
||||
if (op->op == GGML_OP_MUL_MAT &&
|
||||
op->src[0]->type == GGML_TYPE_Q4_0 &&
|
||||
op->src[0]->buffer &&
|
||||
(ggml_n_dims(op->src[0]) == 2) &&
|
||||
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
|
||||
if ( op->op == GGML_OP_MUL_MAT &&
|
||||
op->src[0]->type == GGML_TYPE_Q4_0 &&
|
||||
op->src[0]->buffer &&
|
||||
(ggml_n_dims(op->src[0]) == 2) &&
|
||||
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels
|
||||
) {
|
||||
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
||||
return false;
|
||||
}
|
||||
@@ -442,19 +260,6 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
}
|
||||
else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
|
||||
op->src[0]->op == GGML_OP_VIEW &&
|
||||
(op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_SOFT_MAX) &&
|
||||
op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[0] != 2) ||
|
||||
(op->src[1]->nb[0] != 4) ||
|
||||
(op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return ggml::cpu::kleidiai::get_tensor_traits(NULL, NULL);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ if (CUDAToolkit_FOUND)

set(CUDA_CXX_FLAGS "")

set(CUDA_FLAGS -use_fast_math -extended-lambda)
set(CUDA_FLAGS -use_fast_math)

if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
# Options are:

@@ -1,61 +1,47 @@
|
||||
#include "acc.cuh"
|
||||
|
||||
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
|
||||
const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
||||
const int ne10, const int ne11, const int ne12,
|
||||
const int nb1, const int nb2, int offset) {
|
||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t src1_idx = i - offset;
|
||||
|
||||
int64_t tmp = src1_idx;
|
||||
const int64_t i13 = tmp / s13;
|
||||
tmp -= i13 * s13;
|
||||
const int64_t i12 = tmp / s12;
|
||||
tmp -= i12 * s12;
|
||||
const int64_t i11 = tmp / s11;
|
||||
tmp -= i11 * s11;
|
||||
const int64_t i10 = tmp;
|
||||
|
||||
float val = x[i];
|
||||
if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
|
||||
val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
|
||||
int src1_idx = i - offset;
|
||||
int oz = src1_idx / nb2;
|
||||
int oy = (src1_idx - (oz * nb2)) / nb1;
|
||||
int ox = src1_idx % nb1;
|
||||
if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
|
||||
dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
|
||||
} else {
|
||||
dst[i] = x[i];
|
||||
}
|
||||
dst[i] = val;
|
||||
}
|
||||
|
||||
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
|
||||
const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
||||
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
|
||||
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
|
||||
const int ne10, const int ne11, const int ne12,
|
||||
const int nb1, const int nb2, const int offset, cudaStream_t stream) {
|
||||
int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
||||
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(dst));
|
||||
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
||||
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
||||
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
||||
int offset = dst->op_params[3] / 4; // offset in bytes
|
||||
|
||||
const int64_t s1 = dst->op_params[0] / sizeof(float);
|
||||
const int64_t s2 = dst->op_params[1] / sizeof(float);
|
||||
const int64_t s3 = dst->op_params[2] / sizeof(float);
|
||||
const int64_t offset = dst->op_params[3] / sizeof(float);
|
||||
|
||||
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
|
||||
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
|
||||
}
|
||||
|
||||
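The rewritten acc_f32 kernel above recovers (i13, i12, i11, i10) from the flattened destination index using the strides s11/s12/s13 and then re-linearizes them against src1's extents. A small host-side check of that round trip under the assumption of contiguous strides (illustrative values only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
    const int64_t ne10 = 5, ne11 = 3, ne12 = 2, ne13 = 4;      // extents
    const int64_t s11 = ne10, s12 = ne11*s11, s13 = ne12*s12;  // contiguous strides (in elements)

    for (int64_t idx = 0; idx < ne13*s13; ++idx) {
        int64_t tmp = idx;
        const int64_t i13 = tmp / s13; tmp -= i13 * s13;
        const int64_t i12 = tmp / s12; tmp -= i12 * s12;
        const int64_t i11 = tmp / s11; tmp -= i11 * s11;
        const int64_t i10 = tmp;
        // same linearization the kernel uses when reading y[...]
        assert(idx == ((i13*ne12 + i12) * ne11 + i11) * ne10 + i10);
    }
    return 0;
}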
@@ -296,25 +296,6 @@ static __device__ void no_device_code(
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

// The compiler is not always able to unroll loops if they contain continue expressions.
// In such cases loop unrolling can still be achieved via recursion:
template <int n>
struct ggml_cuda_unroll {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(n - 1, args...);
        ggml_cuda_unroll<n - 1>{}(f, args...);
    }
};

template <>
struct ggml_cuda_unroll<1> {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(0, args...);
    }
};

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
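The recursive ggml_cuda_unroll helper above turns each loop iteration into a separate call the compiler can expand, which is useful precisely when a continue in the body prevents ordinary unrolling. A host-side C++ analogue (plain C++, not the repo's CUDA header) showing how it expands:

#include <cstdio>

template <int n>
struct unroll {
    template <typename Func, typename... Args>
    void operator()(const Func & f, Args... args) const {
        f(n - 1, args...);           // iteration n-1 ...
        unroll<n - 1>{}(f, args...); // ... then recurse down to 0
    }
};

template <>
struct unroll<1> {
    template <typename Func, typename... Args>
    void operator()(const Func & f, Args... args) const {
        f(0, args...);
    }
};

int main() {
    int acc = 0;
    unroll<4>{}([&](int i, int scale) {
        if (i == 2) { return; }      // plays the role of 'continue'
        acc += i * scale;
    }, 10);
    std::printf("%d\n", acc);        // (3 + 1 + 0) * 10 = 40
    return 0;
}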
@@ -2,17 +2,6 @@
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
|
||||
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
return __cvta_generic_to_shared(generic_ptr);
|
||||
#else
|
||||
GGML_UNUSED(generic_ptr);
|
||||
NO_DEVICE_CODE;
|
||||
return 0;
|
||||
#endif // CP_ASYNC_AVAILABLE
|
||||
}
|
||||
|
||||
// Copies data from global to shared memory, cg == cache global.
|
||||
// Both the src and dst pointers must be aligned to 16 bit.
|
||||
// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
|
||||
|
||||
@@ -516,7 +516,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
||||
nullptr;
|
||||
}
|
||||
|
||||
template<int D, int ncols1, int ncols2> // D == head size
|
||||
template<int D, int ncols1, int ncols2, int KQ_stride> // D == head size
|
||||
__launch_bounds__(D, 1)
|
||||
static __global__ void flash_attn_stream_k_fixup(
|
||||
float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) {
|
||||
@@ -665,27 +665,23 @@ static void on_no_fattn_vec_case(const int D) {
|
||||
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D);
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
|
||||
fprintf(stderr, "Only f16 is supported.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
template <int DV, int ncols1, int ncols2>
|
||||
template <int D, int ncols1, int ncols2, int KQ_stride>
|
||||
void launch_fattn(
|
||||
ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
|
||||
const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
|
||||
) {
|
||||
constexpr int ncols = ncols1 * ncols2;
|
||||
|
||||
const bool is_mla = DV == 512; // TODO better parameterization
|
||||
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
GGML_ASSERT(V || is_mla);
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
@@ -693,13 +689,9 @@ void launch_fattn(
|
||||
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT( Q->nb[0] == ggml_element_size(Q));
|
||||
GGML_ASSERT( K->nb[0] == ggml_element_size(K));
|
||||
GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));
|
||||
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
|
||||
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
|
||||
|
||||
@@ -721,10 +713,10 @@ void launch_fattn(
|
||||
size_t nb12 = K->nb[2];
|
||||
size_t nb13 = K->nb[3];
|
||||
|
||||
const char * V_data = V ? (const char *) V->data : nullptr;
|
||||
size_t nb21 = V ? V->nb[1] : nb11;
|
||||
size_t nb22 = V ? V->nb[2] : nb12;
|
||||
size_t nb23 = V ? V->nb[3] : nb13;
|
||||
const char * V_data = (const char *) V->data;
|
||||
size_t nb21 = V->nb[1];
|
||||
size_t nb22 = V->nb[2];
|
||||
size_t nb23 = V->nb[3];
|
||||
|
||||
if (need_f16_K && K->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(K));
|
||||
@@ -741,7 +733,7 @@ void launch_fattn(
|
||||
nb13 = nb13*bs*sizeof(half)/ts;
|
||||
}
|
||||
|
||||
if (V && need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
if (need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(V));
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
@@ -762,13 +754,10 @@ void launch_fattn(
|
||||
const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];
|
||||
|
||||
const dim3 block_dim(warp_size, nwarps, 1);
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
|
||||
dim3 blocks_num;
|
||||
if (stream_k) {
|
||||
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
|
||||
const int max_blocks = max_blocks_per_sm*nsm;
|
||||
const int max_blocks = 2*nsm;
|
||||
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
|
||||
const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);
|
||||
|
||||
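The stream-k branch above trades parallelism against the cost of the fixup pass: with max_blocks CTAs available, ntiles_total tiles are processed in ceil(ntiles_total / max_blocks) waves, and tiles_efficiency_percent measures how full those waves are. Worked numbers (assumed, for illustration only):

#include <cstdio>

int main() {
    const int nsm               = 108;                                               // e.g. an A100
    const int max_blocks_per_sm = 2;
    const int max_blocks        = max_blocks_per_sm * nsm;                           // 216
    const int ntiles_total      = 500;
    const int tiles_nwaves      = (ntiles_total + max_blocks - 1) / max_blocks;      // 3 waves
    const int efficiency        = 100 * ntiles_total / (max_blocks * tiles_nwaves);  // 77 %
    std::printf("waves=%d efficiency=%d%%\n", tiles_nwaves, efficiency);
    return 0;
}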
@@ -780,11 +769,14 @@ void launch_fattn(
|
||||
blocks_num.y = 1;
|
||||
blocks_num.z = 1;
|
||||
|
||||
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
|
||||
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float));
|
||||
} else {
|
||||
GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
|
||||
const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
|
||||
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
|
||||
// parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
|
||||
parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
|
||||
|
||||
@@ -861,19 +853,19 @@ void launch_fattn(
|
||||
|
||||
if (stream_k) {
|
||||
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
|
||||
|
||||
flash_attn_stream_k_fixup<DV, ncols1, ncols2>
|
||||
flash_attn_stream_k_fixup<D, ncols1, ncols2, KQ_stride>
|
||||
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
|
||||
((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
|
||||
}
|
||||
} else if (parallel_blocks > 1) {
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
|
||||
const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);
|
||||
|
||||
flash_attn_combine_results<DV>
|
||||
flash_attn_combine_results<D>
|
||||
<<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
|
||||
}
|
||||
|
||||
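Note: the stream-k branch above sizes the launch from a wave count rather than per-row blocks. A minimal standalone sketch of that arithmetic (not part of the diff; the helper name tiles_efficiency_percent is illustrative only):

    // Assuming max_blocks = number of blocks launched per wave (e.g. 2*nsm or max_blocks_per_sm*nsm)
    // and ntiles_total = total number of output tiles to process.
    static int tiles_efficiency_percent(int ntiles_total, int max_blocks) {
        const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks; // ceil: full waves needed
        return 100 * ntiles_total / (max_blocks * tiles_nwaves);               // share of blocks doing useful work
    }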
File diff suppressed because it is too large
@@ -307,7 +307,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
launch_fattn<D, cols_per_block, 1, -1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
} break;
case 128: {
@@ -315,7 +315,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
launch_fattn<D, cols_per_block, 1, -1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
} break;
default: {

@@ -318,7 +318,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
launch_fattn<D, cols_per_block, 1, -1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
} break;
case 128: {
@@ -326,7 +326,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
launch_fattn<D, cols_per_block, 1, -1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
} break;
default: {

@@ -168,7 +168,6 @@ static __global__ void flash_attn_vec_ext_f16(
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
__syncthreads();

half2 VKQ[ncols] = {{0.0f, 0.0f}};

@@ -316,7 +315,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}

template <int D, ggml_type type_K, ggml_type type_V>

@@ -310,7 +310,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}

template <int D, ggml_type type_K, ggml_type type_V>

@@ -490,7 +490,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
}
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
}

void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

@@ -8,33 +8,58 @@
#include "fattn-wmma-f16.cuh"
#include "fattn.cuh"

template <int DKQ, int DV, int ncols2>
template <int D, int ncols2>
static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const ggml_tensor * Q = dst->src[0];

if constexpr (ncols2 <= 8) {
if (Q->ne[1] <= 8/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
return;
}
if (Q->ne[1] <= 8/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<D, 8/ncols2, ncols2>(ctx, dst);
return;
}

if (Q->ne[1] <= 16/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
ggml_cuda_flash_attn_ext_mma_f16_case<D, 16/ncols2, ncols2>(ctx, dst);
return;
}

if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || Q->ne[1] <= 32/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
if (Q->ne[1] <= 32/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<D, 32/ncols2, ncols2>(ctx, dst);
return;
}

ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 64/ncols2, ncols2>(ctx, dst);
ggml_cuda_flash_attn_ext_mma_f16_case<D, 64/ncols2, ncols2>(ctx, dst);
}

template <int DKQ, int DV>
static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
template <int ncols2>
static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];

switch (Q->ne[0]) {
case 64:
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 64, ncols2>(ctx, dst);
break;
case 80:
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 80, ncols2>(ctx, dst);
break;
case 96:
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 96, ncols2>(ctx, dst);
break;
case 112:
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<112, ncols2>(ctx, dst);
break;
case 128:
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst);
break;
case 256:
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst);
break;
default:
GGML_ABORT("fatal error");
break;
}
}

static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
@@ -43,79 +68,27 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));

const bool use_gqa_opt = mask && max_bias == 0.0f;
const float use_gqa_opt = mask && max_bias == 0.0f;

GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
const int gqa_ratio = Q->ne[2] / K->ne[2];

if (use_gqa_opt && gqa_ratio % 8 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 8>(ctx, dst);
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<8>(ctx, dst);
return;
}

if (use_gqa_opt && gqa_ratio % 4 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 4>(ctx, dst);
if (use_gqa_opt && gqa_ratio == 4) {
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<4>(ctx, dst);
return;
}

if (use_gqa_opt && gqa_ratio % 2 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
if (use_gqa_opt && gqa_ratio == 2) {
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<2>(ctx, dst);
return;
}

ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
}

static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];

switch (Q->ne[0]) {
case 64:
GGML_ASSERT(V->ne[0] == 64);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 64, 64>(ctx, dst);
break;
case 80:
GGML_ASSERT(V->ne[0] == 80);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 80, 80>(ctx, dst);
break;
case 96:
GGML_ASSERT(V->ne[0] == 96);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 96, 96>(ctx, dst);
break;
case 112:
GGML_ASSERT(V->ne[0] == 112);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<112, 112>(ctx, dst);
break;
case 128:
GGML_ASSERT(V->ne[0] == 128);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
break;
case 256:
GGML_ASSERT(V->ne[0] == 256);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
break;
case 576: {
// For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
GGML_ASSERT(V->ne[0] == 512);
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));

const bool use_gqa_opt = mask && max_bias == 0.0f;
GGML_ASSERT(use_gqa_opt);

GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
const int gqa_ratio = Q->ne[2] / K->ne[2];
GGML_ASSERT(gqa_ratio % 16 == 0);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
} break;
default:
GGML_ABORT("fatal error");
break;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<1>(ctx, dst);
}

#define FATTN_VEC_F16_CASE(D, type_K, type_V) \
@@ -326,7 +299,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0;
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
if (prec == GGML_PREC_DEFAULT) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);

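Note: both sides of the dispatcher above select the GQA tile width from the ratio of Q heads to K heads; one side checks divisibility, the other exact equality. A hedged sketch of the divisibility variant (the helper name pick_ncols2 is illustrative, not from the tree):

    // Pick the largest ncols2 in {8, 4, 2, 1} compatible with the head-count ratio.
    static int pick_ncols2(int64_t q_ne2, int64_t k_ne2, bool use_gqa_opt) {
        const int gqa_ratio = (int)(q_ne2 / k_ne2); // Q->ne[2] is asserted to be a multiple of K->ne[2]
        if (use_gqa_opt && gqa_ratio % 8 == 0) return 8;
        if (use_gqa_opt && gqa_ratio % 4 == 0) return 4;
        if (use_gqa_opt && gqa_ratio % 2 == 0) return 2;
        return 1;
    }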
@@ -10,11 +10,10 @@ static __global__ void k_get_rows(
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
const int i10 = blockIdx.x;
const int i11 = blockIdx.z / ne12;
const int i12 = blockIdx.z % ne12;
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

if (i00 >= ne00) {
return;
@@ -47,11 +46,10 @@ static __global__ void k_get_rows_float(
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
const int i10 = blockIdx.x;
const int i11 = blockIdx.z / ne12;
const int i12 = blockIdx.z % ne12;
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

if (i00 >= ne00) {
return;
@@ -96,8 +94,8 @@ static void get_rows_cuda_q(
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
const dim3 block_nums(block_num_x, ne10, ne11*ne12);

// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
@@ -129,8 +127,8 @@ static void get_rows_cuda_float(
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne10, ne11*ne12);

// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);

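Note: the two index mappings in the get_rows hunks above differ only in which grid axis carries the row index and which carries the column blocks; CUDA allows gridDim.x up to 2^31-1 while gridDim.y and gridDim.z are limited to 65535, which is the reason the commented variant swaps the axes. A minimal sketch of the swapped mapping (the device helper name map_indices is illustrative only):

    // Swapped variant: potentially huge row index i10 rides on the x axis,
    // column blocks i00 on the y axis, batch dimensions folded into z.
    __device__ void map_indices(int ne12, int & i00, int & i10, int & i11, int & i12) {
        i00 = (blockIdx.y*blockDim.x + threadIdx.x)*2;
        i10 = blockIdx.x;
        i11 = blockIdx.z / ne12;
        i12 = blockIdx.z % ne12;
    }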
@@ -1909,19 +1909,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
// Therefore, in such cases use cuBLAS.
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;

bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

bool any_gpus_with_slow_fp16 = false;
@@ -3221,16 +3215,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return false;
#endif // FLASH_ATTN_AVAILABLE
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
if (!new_mma_available(cc)) {
return false;
}
const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
// different head sizes of K and V are not supported yet
return false;
}
if (op->src[0]->ne[0] == 192) {
return false;
}
if (op->src[0]->ne[0] == 576) {
// DeepSeek MLA
return false;
}
if (op->src[0]->ne[3] != 1) {
return false;
}

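Note: the left-hand side of the mul_mat hunk gates the quantized kernels on whether clearing the buffer padding would be safe. A hedged restatement of that guard as a standalone predicate (the function name mul_mat_padding_clear_is_unsafe is illustrative; the ggml calls are the ones used in the diff):

    #include "ggml-backend.h"

    // Fall back to cuBLAS when src0 sits in a temporary compute buffer, has allocation
    // padding, and is a view: zeroing the padding could overwrite the viewed tensor's data.
    static bool mul_mat_padding_clear_is_unsafe(const ggml_tensor * src0) {
        return ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
            && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0)
            && src0->view_src != nullptr;
    }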
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(

// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}
@@ -122,7 +122,6 @@ void ggml_cuda_mul_mat_q(
const int64_t s13 = src1->nb[3] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
CUDA_CHECK(cudaGetLastError());
}

const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
@@ -206,7 +205,6 @@ void ggml_cuda_mul_mat_q(
const int64_t s13 = src1->nb[2] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
CUDA_CHECK(cudaGetLastError());
}

const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));

@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(

// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}

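Note: both padding-clear hunks above perform the same step; a minimal standalone sketch (the helper name clear_src0_padding is illustrative, and CUDA_CHECK is the error-checking macro used throughout the CUDA backend):

    // Zero the bytes between the tensor's logical size and its allocated size so the
    // quantized matmul kernels never read uninitialized padding.
    static void clear_src0_padding(const ggml_tensor * src0, cudaStream_t stream) {
        const size_t size_data  = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }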
@@ -56,13 +56,13 @@ static __global__ void quantize_mmq_q8_1(
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;

const int64_t i0 = ((int64_t)blockDim.x*blockIdx.y + threadIdx.x)*4;
const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;

if (i0 >= ne0) {
return;
}

const int64_t i1 = blockIdx.x;
const int64_t i1 = blockIdx.y;
const int64_t i2 = blockIdx.z % ne2;
const int64_t i3 = blockIdx.z / ne2;

@@ -75,8 +75,8 @@ static __global__ void quantize_mmq_q8_1(

block_q8_1_mmq * y = (block_q8_1_mmq *) vy;

const int64_t ib0 = blockIdx.z*((int64_t)gridDim.x*gridDim.y*blockDim.x/QK8_1); // first block of channel
const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.x; // block index in channel
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel
const int64_t iqs = i0 % (4*QK8_1); // quant index in block

// Load 4 floats per thread and calculate max. abs. value between them:
@@ -166,9 +166,8 @@ void quantize_mmq_q8_1_cuda(
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne0 % (4*QK8_1) == 0);

// ne1 tends to assume the highest values, therefore use it as the "x" dimension of the CUDA grid:
const int64_t block_num_y = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
const dim3 num_blocks(ne1, block_num_y, ne2*ne3);
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
switch (mmq_get_q8_1_ds_layout(type_src0)) {
case MMQ_Q8_1_DS_LAYOUT_D4:
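Note: the two launch configurations in the quantize hunk differ only in which grid axis carries ne1; the in-tree comment states that ne1 tends to be the largest extent, so it goes on the axis with the larger limit. A hedged sketch of the commented variant (the helper name quantize_grid is illustrative only):

    // Column blocks of ne0 are grouped 4 values per thread; ne1 rides on the x axis.
    static dim3 quantize_grid(int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, int block_size) {
        const int64_t nblocks = (ne0 + 4*block_size - 1) / (4*block_size); // ceil(ne0 / (4*block_size))
        return dim3((unsigned) ne1, (unsigned) nblocks, (unsigned) (ne2*ne3));
    }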
|
||||
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(ggml_is_contiguous(src0));

const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 1, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 4);
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 2, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 2, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 32, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 32, 2);
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 64, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 8);
|
||||
|
||||
@@ -18,7 +18,7 @@ SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_f
|
||||
|
||||
"""
|
||||
|
||||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size_kq}, {head_size_v}, {ncols1}, {ncols2});\n"
|
||||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size}, {ncols1}, {ncols2});\n"
|
||||
|
||||
TYPES_MMQ = [
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
@@ -57,21 +57,18 @@ for vkq_size in [16, 32]:
|
||||
with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
|
||||
|
||||
for ncols in [8, 16, 32, 64]:
|
||||
for ncols2 in [1, 2, 4, 8, 16]:
|
||||
if ncols2 > ncols:
|
||||
continue
|
||||
for ncols in [8, 16, 32, 64, 128]:
|
||||
for ncols2 in [1, 2, 4, 8]:
|
||||
ncols1 = ncols // ncols2
|
||||
if ncols == 128:
|
||||
continue # Too much register pressure.
|
||||
with open(f"fattn-mma-f16-instance-ncols1_{ncols1}-ncols2_{ncols2}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_MMA_START)
|
||||
|
||||
for head_size_kq in [64, 80, 96, 112, 128, 256, 576]:
|
||||
if head_size_kq != 576 and ncols2 == 16:
|
||||
continue
|
||||
if head_size_kq == 576 and ncols2 != 16:
|
||||
continue
|
||||
head_size_v = head_size_kq if head_size_kq != 576 else 512
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
|
||||
for head_size in [64, 80, 96, 112, 128, 256]:
|
||||
if ncols == 128 and head_size == 256:
|
||||
continue # Needs too much shared memory.
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size=head_size))
|
||||
|
||||
for type in TYPES_MMQ:
|
||||
with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
|
||||
|
||||
@@ -207,10 +207,6 @@ typedef struct {
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
int32_t sect_0;
|
||||
int32_t sect_1;
|
||||
int32_t sect_2;
|
||||
int32_t sect_3;
|
||||
} ggml_metal_kargs_rope;
|
||||
|
||||
typedef struct {
|
||||
@@ -303,42 +299,21 @@ typedef struct {
|
||||
} ggml_metal_kargs_mul_mv_ext;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne10;
|
||||
int32_t ne11; // n_expert_used (bcast)
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
int32_t neh11; // n_tokens
|
||||
uint64_t nbh11;
|
||||
int32_t ne20; // n_expert_used
|
||||
uint64_t nb21;
|
||||
} ggml_metal_kargs_mul_mm_id_map0;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne20; // n_expert_used
|
||||
int32_t neh0;
|
||||
int32_t neh1;
|
||||
uint64_t nbh1;
|
||||
uint64_t nbh2;
|
||||
int32_t ne0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
} ggml_metal_kargs_mul_mm_id_map1;
|
||||
|
||||
typedef struct {
|
||||
int32_t nei0;
|
||||
int32_t nei1;
|
||||
uint64_t nbi1;
|
||||
int32_t ne00;
|
||||
int32_t ne02;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int32_t neh12;
|
||||
uint64_t nbh10;
|
||||
uint64_t nbh11;
|
||||
uint64_t nbh12;
|
||||
uint64_t nbh13;
|
||||
int32_t neh0;
|
||||
int32_t neh1;
|
||||
int16_t r2;
|
||||
int16_t r3;
|
||||
int32_t ne11;
|
||||
int32_t ne12;
|
||||
int32_t ne13;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
int32_t ne0;
|
||||
int32_t ne1;
|
||||
} ggml_metal_kargs_mul_mm_id;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -306,36 +306,30 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F16,
|
||||
@@ -415,13 +409,6 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
|
||||
@@ -663,8 +650,7 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
|
||||
}
|
||||
|
||||
if (mem_pool->heaps_to_remove.count > 0) {
|
||||
// remove in reverse order
|
||||
for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) {
|
||||
for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) {
|
||||
NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
|
||||
ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];
|
||||
|
||||
@@ -673,10 +659,6 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
|
||||
|
||||
[mem_pool->heaps removeObjectAtIndex:index];
|
||||
[ptr release];
|
||||
|
||||
if (i == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
[mem_pool->heaps_to_remove removeAllObjects];
|
||||
@@ -690,7 +672,7 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
|
||||
}
|
||||
|
||||
static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
|
||||
const size_t alignment = 256;
|
||||
const size_t alignment = 32;
|
||||
|
||||
const size_t size_aligned = GGML_PAD(size, alignment);
|
||||
|
||||
@@ -1260,36 +1242,30 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, mul_mm_id_map0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, mul_mm_id_map1_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, mul_mm_id_f32_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, mul_mm_id_f16_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, mul_mm_id_bf16_f16, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16, mul_mm_id_q4_0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16, mul_mm_id_q4_1_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16, mul_mm_id_q5_0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16, mul_mm_id_q5_1_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16, mul_mm_id_q8_0_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16, mul_mm_id_q2_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16, mul_mm_id_q3_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16, mul_mm_id_q4_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16, mul_mm_id_q5_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16, mul_mm_id_q6_K_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16, mul_mm_id_iq2_xxs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16, mul_mm_id_iq2_xs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16, mul_mm_id_iq3_xxs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16, mul_mm_id_iq3_s_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16, mul_mm_id_iq2_s_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16, mul_mm_id_iq1_s_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16, mul_mm_id_iq1_m_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16, mul_mm_id_iq4_nl_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, mul_mm_id_iq4_xs_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, rope_multi_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, rope_multi_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, rope_vision_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, rope_vision_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, rope_neox_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
|
||||
@@ -1369,13 +1345,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64, flash_attn_ext_vec_f16_h64, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64, flash_attn_ext_vec_bf16_h64, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64, flash_attn_ext_vec_q4_0_h64, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64, flash_attn_ext_vec_q4_1_h64, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64, flash_attn_ext_vec_q5_0_h64, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64, flash_attn_ext_vec_q5_1_h64, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64, flash_attn_ext_vec_q8_0_h64, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, flash_attn_ext_vec_f16_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, flash_attn_ext_vec_bf16_h96, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, flash_attn_ext_vec_q4_0_h96, has_simdgroup_reduction);
|
||||
@@ -1659,7 +1628,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_NORM:
|
||||
return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
|
||||
case GGML_OP_ROPE:
|
||||
return true;
|
||||
{
|
||||
const int mode = ((const int32_t *) op->op_params)[2];
|
||||
if (mode & GGML_ROPE_TYPE_MROPE) {
|
||||
return false;
|
||||
}
|
||||
if (mode & GGML_ROPE_TYPE_VISION) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_IM2COL:
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_1D:
|
||||
@@ -3021,7 +2999,7 @@ static bool ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
@@ -3241,6 +3219,8 @@ static bool ggml_metal_encode_node(
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
const int n_as = src0->ne[2];
|
||||
|
||||
// src2 = ids
|
||||
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
|
||||
|
||||
@@ -3254,21 +3234,24 @@ static bool ggml_metal_encode_node(
|
||||
GGML_ASSERT(ne03 == 1);
|
||||
GGML_ASSERT(ne13 == 1);
|
||||
|
||||
const uint32_t r2 = 1;
|
||||
const uint32_t r3 = 1;
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
// ne20 = n_used_experts
|
||||
// ne21 = n_rows (batch size)
|
||||
const int ne21_mm_id_min = 32;
|
||||
// ne21 = n_rows
|
||||
const int dst_rows = ne20*ne21;
|
||||
const int dst_rows_min = n_as;
|
||||
const int dst_rows_max = (device.maxThreadgroupMemoryLength/2 - 8192)/4;
|
||||
|
||||
// max size of the rowids array in the kernel shared buffer
|
||||
//GGML_ASSERT(dst_rows <= dst_rows_max);
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if ([device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00 % 32 == 0 && ne00 >= 64 &&
|
||||
(ne21 >= ne21_mm_id_min)) {
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
//ne01 / ne02 >= 512 && // NOTE: this is based on Mixtral shapes, might need adjustments
|
||||
dst_rows > dst_rows_min &&
|
||||
dst_rows <= dst_rows_max) {
|
||||
|
||||
// some Metal matrix data types require aligned pointers
|
||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
|
||||
@@ -3279,169 +3262,62 @@ static bool ggml_metal_encode_node(
|
||||
default: break;
|
||||
}
|
||||
|
||||
const int64_t neh10 = ne10; // n_embd
|
||||
const int64_t neh11 = ne21; // n_tokens
|
||||
const int64_t neh12 = ne02; // n_expert
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
const uint64_t nbh10 = ggml_type_size(GGML_TYPE_F16);
|
||||
const uint64_t nbh11 = nbh10*neh10;
|
||||
const uint64_t nbh12 = nbh11*neh11;
|
||||
const uint64_t nbh13 = nbh12*neh12;
|
||||
|
||||
const size_t s_src1 = ggml_type_size(GGML_TYPE_F16)*neh10*neh11*neh12;
|
||||
id<MTLBuffer> h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1);
|
||||
if (!h_src1) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1);
|
||||
return false;
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
|
||||
default: GGML_ABORT("MUL_MAT_ID not implemented");
|
||||
}
|
||||
|
||||
const int64_t neh0 = ne0;
const int64_t neh1 = ne21;
const int64_t neh2 = ne02;
ggml_metal_kargs_mul_mm_id args = {
    /*.nei0 =*/ ne20,
    /*.nei1 =*/ ne21,
    /*.nbi1 =*/ nb21,
    /*.ne00 =*/ ne00,
    /*.ne02 =*/ ne02,
    /*.nb01 =*/ nb01,
    /*.nb02 =*/ nb02,
    /*.ne11 =*/ ne11,
    /*.ne12 =*/ ne12,
    /*.ne13 =*/ ne13,
    /*.nb10 =*/ nb10,
    /*.nb11 =*/ nb11,
    /*.nb12 =*/ nb12,
    /*.ne0  =*/ ne0,
    /*.ne1  =*/ ne1,
};

const uint64_t nbh0 = ggml_type_size(GGML_TYPE_F32);
const uint64_t nbh1 = nbh0*neh0;
const uint64_t nbh2 = nbh1*neh1;
//const uint64_t nbh3 = nbh2*neh2;
[encoder setComputePipelineState:pipeline];
[encoder setBytes:&args length:sizeof(args) atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];

const size_t s_dst = ggml_type_size(GGML_TYPE_F32)*neh0*neh1*neh2;
id<MTLBuffer> h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst);
if (!h_dst) {
    GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst);
    return false;
}
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];

// tokens per expert
const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02;
id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
if (!h_tpe) {
    GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe);
    return false;
}

// id map
// [n_expert_used, n_tokens]
const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne20*ne21;
id<MTLBuffer> h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids);
if (!h_ids) {
    GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids);
    return false;
}

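// step 1: for each expert, gather the src1 rows it has to process into the staging buffer
// h_src1, and record the per-expert token counts (h_tpe) and the row id map (h_ids);
// this appears to be done by kernel_mul_mm_id_map0 below, one threadgroup per expert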
{
    const int nth = MIN(1024, ne10/4);

    ggml_metal_kargs_mul_mm_id_map0 args = {
        ne10,
        ne11,  // n_expert_used (bcast)
        nb11,
        nb12,
        neh11, // n_tokens
        nbh11,
        ne20,  // n_expert_used
        nb21,
    };

    id<MTLComputePipelineState> pipeline = nil;

    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16].pipeline;

    [encoder setComputePipelineState:pipeline];
    [encoder setBytes:&args length:sizeof(args) atIndex:0];
    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
    [encoder setBuffer: h_src1 offset:0         atIndex:3];
    [encoder setBuffer: h_tpe  offset:0         atIndex:4];
    [encoder setBuffer: h_ids  offset:0         atIndex:5];

    [encoder dispatchThreadgroups:MTLSizeMake(ne02, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
}

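// step 2: per-expert matrix multiplication of src0 with the gathered rows in h_src1,
// writing the expert-ordered result into the staging buffer h_dst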
{
    id<MTLComputePipelineState> pipeline = nil;

    switch (src0->type) {
        case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16].pipeline;     break;
        case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16].pipeline;     break;
        case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16].pipeline;    break;
        case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16].pipeline;    break;
        case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16].pipeline;    break;
        case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16].pipeline;    break;
        case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16].pipeline;    break;
        case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16].pipeline;    break;
        case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16].pipeline;    break;
        case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16].pipeline;    break;
        case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16].pipeline;    break;
        case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16].pipeline;    break;
        case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16].pipeline;    break;
        case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16].pipeline; break;
        case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16].pipeline;  break;
        case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16].pipeline; break;
        case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16].pipeline;   break;
        case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16].pipeline;   break;
        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16].pipeline;   break;
        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16].pipeline;   break;
        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16].pipeline;  break;
        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16].pipeline;  break;
        default: GGML_ABORT("MUL_MAT_ID not implemented");
    }

    ggml_metal_kargs_mul_mm_id args = {
        /*.ne00  =*/ ne00,
        /*.ne02  =*/ ne02,
        /*.nb01  =*/ nb01,
        /*.nb02  =*/ nb02,
        /*.nb03  =*/ nb03,
        /*.neh12 =*/ neh12,
        /*.nbh10 =*/ nbh10,
        /*.nbh11 =*/ nbh11,
        /*.nbh12 =*/ nbh12,
        /*.nbh13 =*/ nbh13,
        /*.neh0  =*/ neh0,
        /*.neh1  =*/ neh1,
        /*.r2    =*/ r2,
        /*.r3    =*/ r3,
    };

    [encoder setComputePipelineState:pipeline];
    [encoder setBytes:&args length:sizeof(args) atIndex:0];
    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
    [encoder setBuffer: h_src1 offset:0         atIndex:2];
    [encoder setBuffer: h_tpe  offset:0         atIndex:3];
    [encoder setBuffer: h_dst  offset:0         atIndex:4];

    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, ne02) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
}

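// step 3: scatter the rows of h_dst back into their final positions in dst, using the
// id map h_ids computed in step 1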
{
    GGML_ASSERT(ne0 % 4 == 0);

    const int nth = MIN(1024, ne0/4);

    ggml_metal_kargs_mul_mm_id_map1 args = {
        ne20, // n_expert_used
        neh0,
        neh1,
        nbh1,
        nbh2,
        ne0,
        nb1,
        nb2,
    };

    id<MTLComputePipelineState> pipeline = nil;

    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32].pipeline;

    [encoder setComputePipelineState:pipeline];
    [encoder setBytes:&args length:sizeof(args) atIndex:0];
    [encoder setBuffer: h_dst offset:0        atIndex:1];
    [encoder setBuffer: h_ids offset:0        atIndex:2];
    [encoder setBuffer:id_dst offset:offs_dst atIndex:3];

    [encoder dispatchThreadgroups:MTLSizeMake(ne20, ne21, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
}
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
    id<MTLComputePipelineState> pipeline = nil;

@@ -3635,7 +3511,7 @@ static bool ggml_metal_encode_node(
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];

const int64_t _ne1 = 1;
const int64_t ne123 = ne20*ne21;
const int64_t ne123 = dst_rows;

if (smem > 0) {
    [encoder setThreadgroupMemoryLength:smem atIndex:0];
@@ -3839,7 +3715,6 @@ static bool ggml_metal_encode_node(
    } break;
case GGML_OP_ROPE:
    {

        // make sure we have one or more position id(ne10) per token(ne02)
        GGML_ASSERT(ne10 % ne02 == 0);
        GGML_ASSERT(ne10 >= ne02);
@@ -3866,42 +3741,20 @@ static bool ggml_metal_encode_node(
memcpy(&beta_fast, (const int32_t *) dst->op_params + 9,  sizeof(float));
memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float));

const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

// mrope
const int sect_0 = ((const int32_t *) dst->op_params)[11];
const int sect_1 = ((const int32_t *) dst->op_params)[12];
const int sect_2 = ((const int32_t *) dst->op_params)[13];
const int sect_3 = ((const int32_t *) dst->op_params)[14];
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

id<MTLComputePipelineState> pipeline = nil;

if (is_neox) {
if (!is_neox) {
    switch (src0->type) {
        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
        default: GGML_ABORT("fatal error");
    };
} else if (is_mrope && !is_vision) {
    GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
    switch (src0->type) {
        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
        default: GGML_ABORT("fatal error");
    };
} else if (is_vision) {
    GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
    switch (src0->type) {
        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
        default: GGML_ABORT("fatal error");
    };
} else {
    switch (src0->type) {
        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
        default: GGML_ABORT("fatal error");
    };
}
@@ -3932,10 +3785,6 @@ static bool ggml_metal_encode_node(
/*.attn_factor =*/ attn_factor,
/*.beta_fast   =*/ beta_fast,
/*.beta_slow   =*/ beta_slow,
/* sect_0      =*/ sect_0,
/* sect_1      =*/ sect_1,
/* sect_2      =*/ sect_2,
/* sect_3      =*/ sect_3,
};

[encoder setComputePipelineState:pipeline];
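// the MULTI (mrope) and VISION rope pipelines are selected through the GGML_ROPE_TYPE_MROPE
// and GGML_ROPE_TYPE_VISION mode flags; sect_0..sect_3 from op_params describe the mrope
// position sections and are forwarded to the kernel via the rope kargs above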
@@ -4372,7 +4221,7 @@ static bool ggml_metal_encode_node(
// TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
//       for now avoiding mainly to keep the number of templates/kernels a bit lower
//       these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
    switch (src1->type) {
        case GGML_TYPE_F16:
        {
@@ -4553,24 +4402,6 @@ static bool ggml_metal_encode_node(
use_vec_kernel = true;

switch (ne00) {
    case 64:
    {
        switch (src1->type) {
            case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64].pipeline;  break;
            case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64].pipeline; break;
            case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64].pipeline; break;
            case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64].pipeline; break;
            case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64].pipeline; break;
            case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64].pipeline; break;
            case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64].pipeline; break;
            default:
            {
                GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
                GGML_LOG_ERROR("add template specialization for this type\n");
                GGML_ABORT("add template specialization for this type");
            }
        }
    } break;
    case 96:
    {
        switch (src1->type) {

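// head size 64 is handled by the _h64 flash-attention vec kernel specializations that are
// instantiated further below in ggml-metal.metal (kernel_flash_attn_ext_vec<..., 64, 64, 8>)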
@@ -2713,148 +2713,8 @@ kernel void kernel_rope_neox(
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_multi(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < args.n_dims) {
|
||||
const int ic = i0/2;
|
||||
|
||||
// mrope theta calculations
|
||||
// note: the rest is the same as kernel_rope_neox
|
||||
const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3;
|
||||
const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1
|
||||
const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float theta_base;
|
||||
if (sector < args.sect_0) {
|
||||
theta_base = (float) pos[i2];
|
||||
} else if (sector < sec_w01) {
|
||||
theta_base = (float) pos[i2 + args.ne02];
|
||||
} else if (sector < sec_w012) {
|
||||
theta_base = (float) pos[i2 + args.ne02 * 2];
|
||||
} else {
|
||||
theta_base = (float) pos[i2 + args.ne02 * 3];
|
||||
}
|
||||
// end of mrope
|
||||
|
||||
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_vision(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < 2*args.n_dims) { // different from kernel_rope_multi
|
||||
const int ic = i0/2;
|
||||
|
||||
// mrope theta calculations (only support 2 dimensions)
|
||||
const int sect_dims = args.sect_0 + args.sect_1;
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float p;
|
||||
float theta_base;
|
||||
if (sector < args.sect_1) {
|
||||
p = (float) sector;
|
||||
theta_base = (float) pos[i2];
|
||||
} else {
|
||||
p = (float) sector - args.sect_0;
|
||||
theta_base = (float) pos[i2 + args.ne02];
|
||||
}
|
||||
|
||||
const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
|
||||
// end of mrope
|
||||
|
||||
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims]; // different from kernel_rope_multi
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
|
||||
typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
|
||||
typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
|
||||
typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
|
||||
|
||||
template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
|
||||
template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
|
||||
@@ -2862,12 +2722,6 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
|
||||
template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
|
||||
template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
|
||||
|
||||
template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
|
||||
template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
|
||||
|
||||
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
|
||||
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
|
||||
|
||||
typedef void (im2col_t)(
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
@@ -3887,11 +3741,6 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
sm[tiisg] = pm[ic + tiisg];
|
||||
}
|
||||
|
||||
// skip -INF blocks
|
||||
if (simd_max(sm[tiisg]) == -INFINITY) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Q*K^T
|
||||
{
|
||||
// each simdgroup processes 1 query and NE (NW/NL) head elements
|
||||
@@ -4124,16 +3973,6 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 64, 64, 8>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 64, 64, 8>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 64, 64, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 64, 64, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 64, 64, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 64, 64, 8>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 64, 64, 8>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 96, 96, 4>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 96, 96, 4>;
|
||||
@@ -6497,219 +6336,127 @@ kernel void kernel_mul_mm(
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T4>
kernel void kernel_mul_mm_id_map0(
        constant ggml_metal_kargs_mul_mm_id_map0 & args,
        device const char * src1,
        device const char * src2,
        device       char * hsrc1,
        device       char * htpe,
        device       char * hids,
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort3 tpitg[[thread_position_in_threadgroup]],
        ushort3 ntg[[threads_per_threadgroup]]) {
    const int ide = tgpig[0]; // expert id

    int n_all = 0;

    device int32_t * ids_i32 = (device int32_t *) (hids);

    for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens
        device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21);

        for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used
            if (src2_i32[i20] != ide) {
                continue;
            }

            device const float4 * src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11);
            device       T4     * hsrc1_f32x4 = (device T4 *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11);

            for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) {
                hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]);
            }

            if (tpitg.x == 0) {
                ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all;
            }

            ++n_all;
        }
    }

    if (tpitg.x == 0) {
        device int32_t * tpe_i32 = (device int32_t *) (htpe);
        tpe_i32[ide] = n_all;
    }
}

typedef decltype(kernel_mul_mm_id_map0<half4>) kernel_mul_mm_id_map0_t;

template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<half4>;

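// kernel_mul_mm_id_map0: one threadgroup per expert (ide); n_all counts the src1 rows gathered
// for that expert, hids records the staging row for every (token, expert_used) pair, and htpe
// receives the per-expert row count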
template<typename T>
kernel void kernel_mul_mm_id_map1(
        constant ggml_metal_kargs_mul_mm_id_map1 & args,
        device const char * hdst,
        device const char * hids,
        device       char * dst,
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort3 tpitg[[thread_position_in_threadgroup]],
        ushort3 ntg[[threads_per_threadgroup]]) {
    const int i20 = tgpig[0]; // used expert
    const int i21 = tgpig[1]; // token

    device const int32_t * ids_i32   = (device const int32_t *) (hids);
    device       float4  * dst_f32x4 = (device float4 *) (dst + i20*args.nb1 + i21*args.nb2);

    const int id = ids_i32[i21*args.ne20 + i20];

    const int ide = id / args.neh1;
    const int idt = id % args.neh1;

    device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2);

    for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) {
        dst_f32x4[i0] = hdst_f32x4[i0];
    }
}

typedef decltype(kernel_mul_mm_id_map1<float>) kernel_mul_mm_id_map1_t;

template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1<float>;

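// kernel_mul_mm_id_map1: one threadgroup per (used expert, token) pair; it looks up the staging
// row in hids and copies that row from hdst back to its final location in dst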
template<typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
|
||||
kernel void kernel_mul_mm_id(
|
||||
constant ggml_metal_kargs_mul_mm_id & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * tpe,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids
|
||||
// TODO: this kernel needs to be reimplemented from scratch for better performance
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
void kernel_mul_mm_id_impl(
|
||||
int32_t ne00,
|
||||
int32_t ne02,
|
||||
uint64_t nb01,
|
||||
uint64_t nb02,
|
||||
int32_t ne11,
|
||||
int32_t ne12,
|
||||
uint64_t nb10,
|
||||
uint64_t nb11,
|
||||
uint64_t nb12,
|
||||
int32_t ne0,
|
||||
int32_t ne1,
|
||||
int64_t ne0ne1,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
threadgroup ushort2 * rowids,
|
||||
device char * dst,
|
||||
threadgroup char * shmem,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
threadgroup T * sa = (threadgroup T *)(shmem);
|
||||
threadgroup half * sb = (threadgroup half *)(shmem + 4096);
|
||||
threadgroup half * sa = (threadgroup half *)(shmem);
|
||||
threadgroup float * sb = (threadgroup float *)(shmem + 4096);
|
||||
|
||||
const int r0 = tgpig.y;
|
||||
const int r1 = tgpig.x;
|
||||
const int im = tgpig.z;
|
||||
|
||||
device const int32_t * tpe_i32 = (device const int32_t *) (tpe);
|
||||
|
||||
const int neh1 = tpe_i32[im];
|
||||
|
||||
if (r1*BLOCK_SIZE_N >= neh1) {
|
||||
return;
|
||||
}
|
||||
if (r1*BLOCK_SIZE_N >= ne1) return;
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
|
||||
simdgroup_T8x8 ma[4];
|
||||
simdgroup_half8x8 mb[2];
|
||||
simdgroup_half8x8 ma[4];
|
||||
simdgroup_float8x8 mb[2];
|
||||
simdgroup_float8x8 mc[8];
|
||||
|
||||
for (short i = 0; i < 8; i++){
|
||||
for (int i = 0; i < 8; i++){
|
||||
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
|
||||
}
|
||||
|
||||
short il = (tiitg % THREAD_PER_ROW);
|
||||
|
||||
const int i12 = im%args.neh12;
|
||||
const int i13 = im/args.neh12;
|
||||
ushort offset1 = il/nl;
|
||||
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const short offset1 = il/nl;
|
||||
threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col];
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0
|
||||
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
|
||||
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1;
|
||||
device const float * y = (device const float *)(src1
|
||||
+ nb12 * id[1]
|
||||
+ nb11 * (id[0] % ne11)
|
||||
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
device const half * y = (device const half *)(src1
|
||||
+ args.nbh13*i13
|
||||
+ args.nbh12*i12
|
||||
+ args.nbh11*(r1*BLOCK_SIZE_N + thread_col)
|
||||
+ args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
|
||||
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
|
||||
// load data and store to threadgroup memory
|
||||
T4x4 temp_a;
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#pragma unroll(16)
|
||||
for (short i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
|
||||
+ (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
|
||||
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
+ (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
}
|
||||
|
||||
*(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y);
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
x = (il < 2) ? x + (2+nl-1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
|
||||
threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
|
||||
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
|
||||
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
|
||||
#pragma unroll(BLOCK_SIZE_K/8)
|
||||
for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
|
||||
#pragma unroll(4)
|
||||
for (short i = 0; i < 4; i++) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
#pragma unroll(2)
|
||||
for (short i = 0; i < 2; i++) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
|
||||
}
|
||||
|
||||
lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
|
||||
#pragma unroll(8)
|
||||
for (short i = 0; i < 8; i++){
|
||||
for (int i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
|
||||
}
|
||||
|
||||
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) {
|
||||
device float * C = (device float *) dst +
|
||||
(BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \
|
||||
(BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0;
|
||||
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0);
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
{
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) \
|
||||
+ 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (sgitg == 0) {
|
||||
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
||||
device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0;
|
||||
threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j];
|
||||
int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1;
|
||||
|
||||
device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + joff;
|
||||
device float4 * D4 = (device float4 *) D;
|
||||
|
||||
threadgroup float * C = temp_str + (j*BLOCK_SIZE_M);
|
||||
@@ -6729,6 +6476,66 @@ kernel void kernel_mul_mm_id(
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
kernel void kernel_mul_mm_id(
|
||||
constant ggml_metal_kargs_mul_mm_id & args,
|
||||
device const char * src0s,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
device const char * ids,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int32_t i02 = tgpig.z;
|
||||
|
||||
tgpig.z = 0;
|
||||
|
||||
device const char * src0 = src0s + i02*args.nb02;
|
||||
|
||||
// row indices
|
||||
threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192);
|
||||
|
||||
// TODO: parallelize this loop
|
||||
int32_t _ne1 = 0;
|
||||
for (ushort ii1 = 0; ii1 < args.nei1; ii1++) {
|
||||
for (ushort ii0 = 0; ii0 < args.nei0; ii0++) {
|
||||
int32_t id = ((device int32_t *) (ids + ii1*args.nbi1))[ii0];
|
||||
if (id == i02) {
|
||||
if (tiitg == 0) {
|
||||
rowids[_ne1] = ushort2(ii0, ii1);
|
||||
}
|
||||
_ne1++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
|
||||
args.ne00,
|
||||
args.ne02,
|
||||
args.nb01,
|
||||
args.nb02,
|
||||
args.ne11,
|
||||
args.ne12,
|
||||
args.nb10,
|
||||
args.nb11,
|
||||
args.nb12,
|
||||
args.ne0,
|
||||
_ne1,
|
||||
(int64_t)args.ne0*args.ne1,
|
||||
src0,
|
||||
src1,
|
||||
rowids,
|
||||
dst,
|
||||
shmem,
|
||||
tgpig,
|
||||
tiitg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
#define QK_NL 16
|
||||
|
||||
//
|
||||
@@ -6769,64 +6576,63 @@ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get
|
||||
// matrix-matrix multiplication
|
||||
//
|
||||
|
||||
typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mul_mm_t;
|
||||
typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mat_mm_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half4x4, 1, dequantize_f16>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16>;
|
||||
template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16>;
|
||||
#endif
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
//
|
||||
|
||||
typedef decltype(kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mul_mm_id;
|
||||
typedef decltype(kernel_mul_mm_id<float4x4, 1, dequantize_f32>) mat_mm_id_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<half4x4, 1, dequantize_f16>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16>;
|
||||
template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<bfloat4x4, 1, dequantize_bf16>;
|
||||
#endif
|
||||
template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
|
||||
@@ -4855,6 +4855,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
if (!any_on_device) {
    return false;
}
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
func = ggml_cl_add;
break;
case GGML_OP_MUL:

@@ -28,19 +28,16 @@ struct ggml_opt_dataset {
};

struct ggml_opt_context {
    ggml_backend_sched_t    backend_sched        = nullptr;
    ggml_cgraph           * allocated_graph      = nullptr;
    ggml_cgraph           * allocated_graph_copy = nullptr;
    struct ggml_context   * ctx_static           = nullptr;
    struct ggml_context   * ctx_cpu              = nullptr;
    struct ggml_context   * ctx_compute          = nullptr;
    struct ggml_context   * ctx_copy             = nullptr;
    ggml_backend_buffer_t   buf_static           = nullptr;
    ggml_backend_buffer_t   buf_cpu              = nullptr;
    std::mt19937            rng;
    enum ggml_opt_loss_type loss_type;
    enum ggml_opt_build_type build_type;
    enum ggml_opt_build_type build_type_alloc;
    ggml_backend_sched_t    backend_sched        = nullptr;
    ggml_cgraph           * allocated_graph      = nullptr;
    ggml_cgraph           * allocated_graph_copy = nullptr;
    struct ggml_context   * ctx_static           = nullptr;
    struct ggml_context   * ctx_static_cpu       = nullptr;
    struct ggml_context   * ctx_compute          = nullptr;
    struct ggml_context   * ctx_copy             = nullptr;
    ggml_backend_buffer_t   buf_static           = nullptr;
    ggml_backend_buffer_t   buf_static_cpu       = nullptr;
    std::mt19937            rng;

    struct ggml_tensor * inputs  = nullptr;
    struct ggml_tensor * outputs = nullptr;
@@ -53,11 +50,6 @@ struct ggml_opt_context {
    struct ggml_cgraph * gf      = nullptr;
    struct ggml_cgraph * gb_grad = nullptr;
    struct ggml_cgraph * gb_opt  = nullptr;
    bool static_graphs = false;
    bool eval_ready    = false;
    std::vector<struct ggml_tensor *> grad_accs;
    std::vector<struct ggml_tensor *> grad_m;
    std::vector<struct ggml_tensor *> grad_v;

    int64_t iter       = 1;
    int32_t opt_period = 1;
@@ -81,13 +73,7 @@ struct ggml_opt_result {

// ====== Dataset ======

ggml_opt_dataset_t ggml_opt_dataset_init(
        enum ggml_type type_data,
        enum ggml_type type_label,
        int64_t        ne_datapoint,
        int64_t        ne_label,
        int64_t        ndata,
        int64_t        ndata_shard) {
ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
    GGML_ASSERT(ne_datapoint >  0);
    GGML_ASSERT(ne_label     >= 0);
    GGML_ASSERT(ndata        >  0);
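// the typed variant of ggml_opt_dataset_init stores data/labels with the caller-provided
// type_data/type_label, while the fixed variant always uses GGML_TYPE_F32 for both tensors
// (see the ggml_new_tensor_2d calls below)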
@@ -106,11 +92,11 @@ ggml_opt_dataset_t ggml_opt_dataset_init(
    result->ctx = ggml_init(params);
}

result->data = ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata);
result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;

if (ne_label > 0) {
    result->labels = ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata);
    result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata);
    result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
} else {
    result->labels = nullptr;
@@ -133,10 +119,6 @@ void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
    delete dataset;
}

int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset) {
    return dataset->ndata;
}

struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
    return dataset->data;
}
@@ -162,8 +144,6 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch));
GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
GGML_ASSERT( data_batch->type == dataset->data->type);
GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type);

const size_t nb_data_batch = ggml_nbytes(data_batch);
GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
@@ -191,31 +171,6 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
    }
}

void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) {
    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);

    const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;

    GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));

    for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
        const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];

        const char * ptr_data       = (const char *) dataset->data->data + ishard      *dataset->nbs_data;
        char       * ptr_data_batch = (char *)       data_batch          + ishard_batch*dataset->nbs_data;
        memcpy(ptr_data_batch, ptr_data, dataset->nbs_data);

        if (!labels_batch) {
            continue;
        }

        const char * ptr_labels       = (const char *) dataset->labels->data + ishard      *dataset->nbs_labels;
        char       * ptr_labels_batch = (char *)       labels_batch          + ishard_batch*dataset->nbs_labels;
        memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels);
    }
}

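// usage sketch (sizes hypothetical): with dataset->nbs_data bytes per shard, a caller passes
// nb_data_batch = shards_per_batch*nbs_data and receives the shards selected by the current
// permutation for batch `ibatch`, plus the matching labels when labels_batch is non-null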
// ====== Model / Context ======

struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
@@ -232,18 +187,17 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
    return result;
}

struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
    return *((struct ggml_opt_optimizer_params *) userdata);
}

struct ggml_opt_params ggml_opt_default_params(
        ggml_backend_sched_t      backend_sched,
        struct ggml_context     * ctx_compute,
        struct ggml_tensor      * inputs,
        struct ggml_tensor      * outputs,
        enum ggml_opt_loss_type   loss_type) {
    return {
        /*backend_sched =*/ backend_sched,
        /*ctx_compute   =*/ nullptr,
        /*inputs        =*/ nullptr,
        /*logits        =*/ nullptr,
        /*ctx_compute   =*/ ctx_compute,
        /*inputs        =*/ inputs,
        /*logits        =*/ outputs,
        /*loss_type     =*/ loss_type,
        /*build_type    =*/ GGML_OPT_BUILD_TYPE_OPT,
        /*opt_period    =*/ 1,
@@ -312,246 +266,195 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
    return dst;
}

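// in one variant ggml_opt_default_params records only backend_sched and loss_type and the
// compute context, inputs and outputs are supplied later; in the other they are passed to
// this constructor directly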
static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
|
||||
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
|
||||
|
||||
const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
|
||||
!(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
|
||||
|
||||
ggml_set_input(opt_ctx->inputs);
|
||||
ggml_set_output(opt_ctx->outputs);
|
||||
|
||||
int n_param = 0;
|
||||
for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) {
|
||||
const struct ggml_tensor * node = opt_ctx->gf->nodes[i];
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
n_param++;
|
||||
}
|
||||
GGML_ASSERT(!(node->flags & GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented");
|
||||
static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
|
||||
GGML_ASSERT(graph);
|
||||
if (opt_ctx->allocated_graph == graph) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!opt_ctx->ctx_static) {
|
||||
ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
|
||||
|
||||
{
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_free(opt_ctx->ctx_copy);
|
||||
opt_ctx->ctx_copy = ggml_init(params);
|
||||
}
|
||||
|
||||
opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
|
||||
|
||||
ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
||||
opt_ctx->allocated_graph = graph;
|
||||
}
|
||||
|
||||
ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
|
||||
ggml_opt_context_t result = new struct ggml_opt_context;
|
||||
result->backend_sched = params.backend_sched;
|
||||
result->ctx_compute = params.ctx_compute;
|
||||
result->inputs = params.inputs;
|
||||
result->outputs = params.outputs;
|
||||
result->opt_period = params.opt_period;
|
||||
result->get_opt_pars = params.get_opt_pars;
|
||||
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
||||
|
||||
GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
|
||||
GGML_ASSERT(result->opt_period >= 1);
|
||||
|
||||
const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
|
||||
(params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
|
||||
|
||||
ggml_set_input(result->inputs);
|
||||
ggml_set_output(result->outputs);
|
||||
|
||||
result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
|
||||
ggml_build_forward_expand(result->gf, result->outputs);
|
||||
|
||||
int n_param = 0;
|
||||
for (int i = 0; i < result->gf->n_nodes; ++i) {
|
||||
if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
n_param++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// The static context is used for:
|
||||
// - gradients (1 per loss, 1 tensor per param if using gradient accumulation)
|
||||
// - gradients (1 tensor per param if using gradient accumulation)
|
||||
// - optimizer momenta (2 tensors per param)
|
||||
// - labels (if using static graphs)
|
||||
// - loss (if using static graphs, up to 5 tensors)
|
||||
// - pred (if using static graphs)
|
||||
// - ncorrect (if using static graphs, 2 tensors).
|
||||
constexpr size_t n_loss = 1;
|
||||
const size_t tensors_per_param = (accumulate ? 1 : 0) +
|
||||
(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
|
||||
const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
|
||||
const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
|
||||
// - labels
|
||||
// - loss + its gradient (up to 5 tensors)
|
||||
// - pred
|
||||
// - ncorrect (2 tensors).
|
||||
const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
|
||||
const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead();
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ size_meta,
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
opt_ctx->ctx_static = ggml_init(params);
|
||||
result->ctx_static = ggml_init(params);
|
||||
}
|
||||
GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc);
|
||||
|
||||
{
|
||||
// The cpu context is allocated statically if using static graphs, dynamically otherwise.
|
||||
// It is used for:
|
||||
// - optimizer parameters (1 shared for all optimizer invocations)
|
||||
// The static cpu context is used for:
|
||||
// - optimizer parameters (1 for the entire context)
|
||||
const size_t size_meta = 1 * ggml_tensor_overhead();
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ size_meta,
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_free(opt_ctx->ctx_cpu);
|
||||
opt_ctx->ctx_cpu = ggml_init(params);
|
||||
|
||||
ggml_backend_buffer_free(opt_ctx->buf_cpu);
|
||||
opt_ctx->buf_cpu = nullptr;
|
||||
result->ctx_static_cpu = ggml_init(params);
|
||||
}
|
||||
|
||||
struct ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute;
|
||||
|
||||
switch (opt_ctx->loss_type) {
|
||||
switch (params.loss_type) {
|
||||
case GGML_OPT_LOSS_TYPE_MEAN: {
|
||||
opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
|
||||
ggml_set_name(opt_ctx->loss, "loss_sum");
|
||||
const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
|
||||
opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
|
||||
ggml_set_name(opt_ctx->loss, "loss_mean");
|
||||
opt_ctx->loss_per_datapoint = true;
|
||||
result->loss = ggml_sum(result->ctx_static, result->outputs);
|
||||
ggml_set_name(result->loss, "loss_sum");
|
||||
const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
|
||||
result->loss = ggml_scale(result->ctx_static, result->loss, scale);
|
||||
ggml_set_name(result->loss, "loss_mean");
|
||||
result->loss_per_datapoint = true;
|
||||
break;
|
||||
}
|
||||
case GGML_OPT_LOSS_TYPE_SUM: {
|
||||
opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
|
||||
ggml_set_name(opt_ctx->loss, "loss_sum");
|
||||
opt_ctx->loss_per_datapoint = false;
|
||||
result->loss = ggml_sum(result->ctx_static, result->outputs);
|
||||
ggml_set_name(result->loss, "loss_sum");
|
||||
result->loss_per_datapoint = false;
|
||||
break;
|
||||
}
|
||||
case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
|
||||
opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
|
||||
ggml_set_input(opt_ctx->labels);
|
||||
ggml_set_name(opt_ctx->labels, "labels");
|
||||
opt_ctx->loss = ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels);
|
||||
ggml_set_name(opt_ctx->loss, "loss_cross_entropy");
|
||||
if (opt_ctx->opt_period > 1) {
|
||||
opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period);
|
||||
ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled");
|
||||
result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
|
||||
ggml_set_input(result->labels);
|
||||
ggml_set_name(result->labels, "labels");
|
||||
result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
|
||||
ggml_set_name(result->loss, "loss_cross_entropy");
|
||||
if (result->opt_period > 1) {
|
||||
result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
|
||||
ggml_set_name(result->loss, "loss_cross_entropy_scaled");
|
||||
}
|
||||
opt_ctx->loss_per_datapoint = true;
|
||||
result->loss_per_datapoint = true;
|
||||
break;
|
||||
}
|
||||
case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
|
||||
opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
|
||||
ggml_set_input(opt_ctx->labels);
|
||||
ggml_set_name(opt_ctx->labels, "labels");
|
||||
opt_ctx->loss = ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels);
|
||||
ggml_set_name(opt_ctx->loss, "loss_error");
|
||||
opt_ctx->loss = ggml_sqr(ctx_results, opt_ctx->loss);
|
||||
ggml_set_name(opt_ctx->loss, "loss_squared_error");
|
||||
opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->loss);
|
||||
ggml_set_name(opt_ctx->loss, "loss_sum_squared_error");
|
||||
const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
|
||||
opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
|
||||
ggml_set_name(opt_ctx->loss, "loss_mean_squared_error");
|
||||
opt_ctx->loss_per_datapoint = true;
|
||||
result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
|
||||
ggml_set_input(result->labels);
|
||||
ggml_set_name(result->labels, "labels");
|
||||
result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
|
||||
ggml_set_name(result->loss, "loss_error");
|
||||
result->loss = ggml_sqr(result->ctx_static, result->loss);
|
||||
ggml_set_name(result->loss, "loss_squared_error");
|
||||
result->loss = ggml_sum(result->ctx_static, result->loss);
|
||||
ggml_set_name(result->loss, "loss_sum_squared_error");
|
||||
const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
|
||||
result->loss = ggml_scale(result->ctx_static, result->loss, scale);
|
||||
ggml_set_name(result->loss, "loss_mean_squared_error");
|
||||
result->loss_per_datapoint = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ggml_set_output(opt_ctx->loss);
|
||||
ggml_set_loss(opt_ctx->loss);
|
||||
ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss);
|
||||
ggml_set_output(result->loss);
|
||||
ggml_set_loss(result->loss);
|
||||
ggml_build_forward_expand(result->gf, result->loss);
|
||||
|
||||
if (opt_ctx->loss_type == GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) {
|
||||
opt_ctx->pred = ggml_argmax(ctx_results, opt_ctx->outputs);
|
||||
ggml_set_name(opt_ctx->pred, "pred");
|
||||
ggml_set_output(opt_ctx->pred);
|
||||
ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred);
|
||||
result->pred = ggml_argmax(result->ctx_static, result->outputs);
|
||||
ggml_set_name(result->pred, "pred");
|
||||
ggml_set_output(result->pred);
|
||||
ggml_build_forward_expand(result->gf, result->pred);
|
||||
|
||||
opt_ctx->ncorrect = ggml_count_equal(ctx_results, opt_ctx->pred, ggml_argmax(ctx_results, opt_ctx->labels));
|
||||
ggml_set_name(opt_ctx->ncorrect, "ncorrect");
|
||||
ggml_set_output(opt_ctx->ncorrect);
|
||||
ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect);
|
||||
if (result->labels) {
|
||||
result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels));
|
||||
ggml_set_name(result->ncorrect, "ncorrect");
|
||||
ggml_set_output(result->ncorrect);
|
||||
ggml_build_forward_expand(result->gf, result->ncorrect);
|
||||
} else {
|
||||
result->ncorrect = nullptr;
|
||||
}
|
||||
|
||||
if (opt_ctx->buf_static) {
|
||||
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
|
||||
return;
|
||||
}
|
||||
} else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_FORWARD) {
|
||||
opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
|
||||
opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
|
||||
return;
|
||||
}
|
||||
|
||||
if (opt_ctx->grad_accs.empty()) {
|
||||
GGML_ASSERT(opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD);
|
||||
|
||||
const int n_nodes = opt_ctx->gf->n_nodes;
|
||||
opt_ctx->grad_accs.resize(n_nodes);
|
||||
for (int i = 0; i < n_nodes; ++i) {
|
||||
ggml_tensor * node = opt_ctx->gf->nodes[i];
|
||||
if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
|
||||
opt_ctx->grad_accs[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
} else {
|
||||
opt_ctx->grad_accs[i] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
|
||||
opt_ctx->grad_m.resize(n_nodes);
|
||||
opt_ctx->grad_v.resize(n_nodes);
|
||||
for (int i = 0; i < n_nodes; ++i) {
|
||||
ggml_tensor * node = opt_ctx->gf->nodes[i];
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
} else {
|
||||
opt_ctx->grad_m[i] = nullptr;
|
||||
opt_ctx->grad_v[i] = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
|
||||
opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
|
||||
ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());
|
||||
|
||||
if (opt_ctx->buf_static) {
|
||||
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_GRAD) {
|
||||
return;
|
||||
}
|
||||
} else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_GRAD) {
|
||||
opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
|
||||
ggml_graph_reset(opt_ctx->gb_grad);
|
||||
}
|
||||
|
||||
GGML_ASSERT(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT);
|
||||
|
||||
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
||||
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
|
||||
|
||||
opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
|
||||
ggml_set_input(opt_ctx->adamw_params);
|
||||
ggml_set_name(opt_ctx->adamw_params, "adamw_params");
|
||||
|
||||
for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
|
||||
struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
|
||||
struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
|
||||
|
||||
if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
|
||||
struct ggml_tensor * m = opt_ctx->grad_m[i];
|
||||
struct ggml_tensor * v = opt_ctx->grad_v[i];
|
||||
struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
|
||||
|
||||
ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
|
||||
ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
|
||||
ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
|
||||
|
||||
ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
|
||||
}
|
||||
}
|
||||
|
||||
if (!opt_ctx->buf_static) {
|
||||
opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
|
||||
opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
|
||||
ggml_graph_reset(opt_ctx->gb_opt);
|
||||
}
|
||||
|
||||
opt_ctx->buf_cpu = ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, ggml_backend_cpu_buffer_type());
|
||||
}
|
||||
|
||||
ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
|
||||
ggml_opt_context_t result = new struct ggml_opt_context;
|
||||
result->backend_sched = params.backend_sched;
|
||||
result->ctx_compute = params.ctx_compute;
|
||||
result->loss_type = params.loss_type;
|
||||
result->build_type = params.build_type;
|
||||
result->build_type_alloc = params.build_type;
|
||||
result->inputs = params.inputs;
|
||||
result->outputs = params.outputs;
|
||||
result->opt_period = params.opt_period;
|
||||
result->get_opt_pars = params.get_opt_pars;
|
||||
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
||||
|
||||
GGML_ASSERT(result->opt_period >= 1);
|
||||
|
||||
result->static_graphs = result->ctx_compute;
|
||||
|
||||
if (!result->static_graphs) {
|
||||
GGML_ASSERT(!result->inputs);
|
||||
GGML_ASSERT(!result->outputs);
|
||||
if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
|
||||
result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
|
||||
return result;
|
||||
}
|
||||
|
||||
GGML_ASSERT(result->inputs);
|
||||
GGML_ASSERT(result->outputs);
|
||||
// gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
|
||||
result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf);
|
||||
ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
|
||||
|
||||
result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
|
||||
ggml_build_forward_expand(result->gf, result->outputs);
|
||||
if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
|
||||
result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
|
||||
ggml_graph_reset(result->gb_grad);
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_opt_build(result);
|
||||
GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT);
|
||||
|
||||
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
||||
result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad);
|
||||
|
||||
result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7);
|
||||
ggml_set_input(result->adamw_params);
|
||||
ggml_set_name(result->adamw_params, "adamw_params");
|
||||
|
||||
for (int i = result->gf->n_nodes-1; i >= 0; --i) {
|
||||
struct ggml_tensor * node = result->gb_opt->nodes[i];
|
||||
struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node);
|
||||
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
struct ggml_tensor * m = ggml_dup_tensor(result->ctx_static, node);
|
||||
struct ggml_tensor * v = ggml_dup_tensor(result->ctx_static, node);
|
||||
struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
|
||||
ggml_build_forward_expand(result->gb_opt, opt_step);
|
||||
}
|
||||
}
|
||||
|
||||
result->buf_static = ggml_backend_alloc_ctx_tensors(
|
||||
result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
|
||||
|
||||
result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
|
||||
|
||||
ggml_graph_reset(result->gb_opt);
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -561,9 +464,9 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
|
||||
return;
|
||||
}
|
||||
ggml_backend_buffer_free(opt_ctx->buf_static);
|
||||
ggml_backend_buffer_free(opt_ctx->buf_cpu);
|
||||
ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
|
||||
ggml_free(opt_ctx->ctx_static);
|
||||
ggml_free(opt_ctx->ctx_cpu);
|
||||
ggml_free(opt_ctx->ctx_static_cpu);
|
||||
delete opt_ctx;
|
||||
}
|
||||
|
||||
@@ -576,10 +479,6 @@ void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer) {
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx) {
|
||||
return opt_ctx->static_graphs;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx) {
|
||||
return opt_ctx->inputs;
|
||||
}
|
||||
@@ -683,79 +582,8 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, doubl
|
||||
|
||||
// ====== Computation ======
|
||||
|
||||
void ggml_opt_prepare_alloc(
|
||||
ggml_opt_context_t opt_ctx,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * inputs,
|
||||
struct ggml_tensor * outputs) {
|
||||
GGML_ASSERT(!opt_ctx->static_graphs);
|
||||
opt_ctx->ctx_compute = ctx_compute;
|
||||
opt_ctx->gf = gf;
|
||||
opt_ctx->inputs = inputs;
|
||||
opt_ctx->outputs = outputs;
|
||||
}
|
||||
|
||||
void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
|
||||
GGML_ASSERT(!opt_ctx->eval_ready);
|
||||
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
|
||||
ggml_graph_reset(opt_ctx->gb_grad);
|
||||
}
|
||||
if (backward) {
|
||||
const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
||||
opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
|
||||
} else {
|
||||
opt_ctx->build_type = GGML_OPT_BUILD_TYPE_FORWARD;
|
||||
}
|
||||
|
||||
if (!opt_ctx->static_graphs) {
|
||||
ggml_opt_build(opt_ctx);
|
||||
}
|
||||
|
||||
struct ggml_cgraph * graph = nullptr;
|
||||
switch (opt_ctx->build_type) {
|
||||
case GGML_OPT_BUILD_TYPE_FORWARD: {
|
||||
graph = opt_ctx->gf;
|
||||
} break;
|
||||
case GGML_OPT_BUILD_TYPE_GRAD: {
|
||||
graph = opt_ctx->gb_grad;
|
||||
} break;
|
||||
case GGML_OPT_BUILD_TYPE_OPT: {
|
||||
graph = opt_ctx->gb_opt;
|
||||
} break;
|
||||
}
|
||||
GGML_ASSERT(graph);
|
||||
|
||||
if (opt_ctx->allocated_graph == graph) {
|
||||
opt_ctx->eval_ready = true;
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
|
||||
|
||||
if (opt_ctx->static_graphs) {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ graph->size*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph->size, graph->grads),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_free(opt_ctx->ctx_copy);
|
||||
opt_ctx->ctx_copy = ggml_init(params);
|
||||
|
||||
opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
|
||||
} else {
|
||||
opt_ctx->allocated_graph_copy = graph;
|
||||
}
|
||||
|
||||
ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
||||
opt_ctx->allocated_graph = graph;
|
||||
|
||||
opt_ctx->eval_ready = true;
|
||||
}
|
||||
|
||||
void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
||||
GGML_ASSERT(opt_ctx->eval_ready);
|
||||
if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
|
||||
static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
|
||||
if (graph != opt_ctx->gf) {
|
||||
struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
|
||||
|
||||
GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
|
||||
@@ -781,19 +609,9 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
||||
adamw_par_data[6] = beta2h;
|
||||
}
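// For orientation: adamw_params is a 7-element F32 tensor. Only index 6 (beta2h) is visible in
// this excerpt; the assumed layout, matching the usual AdamW bias correction, is
//   [0] alpha, [1] beta1, [2] beta2, [3] eps, [4] wd, [5] beta1h, [6] beta2h,
// with e.g. beta1h = 1.0f / (1.0f - powf(beta1, iter)).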
|
||||
|
||||
ggml_opt_alloc_graph(opt_ctx, graph);
|
||||
ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
||||
opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
|
||||
opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
||||
|
||||
if (!opt_ctx->static_graphs) {
|
||||
opt_ctx->gf = nullptr;
|
||||
opt_ctx->gb_grad = nullptr;
|
||||
opt_ctx->gb_opt = nullptr;
|
||||
opt_ctx->allocated_graph = nullptr;
|
||||
opt_ctx->allocated_graph_copy = nullptr;
|
||||
}
|
||||
|
||||
opt_ctx->eval_ready = false;
|
||||
|
||||
if (!result) {
|
||||
return;
|
||||
@@ -817,14 +635,12 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
||||
ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
|
||||
result->loss.push_back(loss);
|
||||
|
||||
if (opt_ctx->pred) {
|
||||
GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
|
||||
std::vector<int32_t> pred(ndata);
|
||||
ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
|
||||
result->pred.insert(result->pred.end(), pred.begin(), pred.end());
|
||||
}
|
||||
GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
|
||||
std::vector<int32_t> pred(ndata);
|
||||
ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
|
||||
result->pred.insert(result->pred.end(), pred.begin(), pred.end());
|
||||
|
||||
if (!opt_ctx->ncorrect || result->ncorrect < 0) {
|
||||
if (!opt_ctx->labels || result->ncorrect < 0) {
|
||||
result->ncorrect = -1;
|
||||
return;
|
||||
}
|
||||
@@ -836,6 +652,26 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
||||
result->ncorrect += ncorrect;
|
||||
}
|
||||
|
||||
void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
|
||||
ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
|
||||
}
|
||||
|
||||
void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
|
||||
if (opt_ctx->opt_period == 1) {
|
||||
ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
||||
if (opt_i_next == 0) {
|
||||
ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
|
||||
ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
|
||||
} else {
|
||||
ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
|
||||
}
|
||||
opt_ctx->opt_i = opt_i_next;
|
||||
}
|
||||
|
||||
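// Usage sketch for the two entry points above (mirrors what ggml_opt_epoch does further down;
// batch loading and result bookkeeping are elided):
//
//     for (int64_t ibatch = 0; ibatch < nbatches_train; ++ibatch) {
//         ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
//         ggml_opt_forward_backward(opt_ctx, result_train); // gradients; optimizer step every opt_period calls
//     }
//     ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch_eval);
//     ggml_opt_forward(opt_ctx, result_eval);               // forward pass only, no parameter update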
// ====== High-Level Functions ======
|
||||
|
||||
void ggml_opt_epoch(
|
||||
@@ -846,7 +682,6 @@ void ggml_opt_epoch(
|
||||
int64_t idata_split,
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval) {
|
||||
GGML_ASSERT(ggml_opt_static_graphs(opt_ctx) && "ggml_opt_epoch requires static graphs");
|
||||
struct ggml_tensor * inputs = ggml_opt_inputs(opt_ctx);
|
||||
struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
|
||||
struct ggml_tensor * data = ggml_opt_dataset_data(dataset);
|
||||
@@ -865,18 +700,16 @@ void ggml_opt_epoch(
|
||||
int64_t ibatch = 0;
|
||||
int64_t t_loop_start = ggml_time_us();
|
||||
for (; ibatch < ibatch_split; ++ibatch) {
|
||||
ggml_opt_alloc(opt_ctx, /*backward =*/ true);
|
||||
ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
|
||||
ggml_opt_eval(opt_ctx, result_train);
|
||||
ggml_opt_forward_backward(opt_ctx, result_train);
|
||||
if (callback_train) {
|
||||
callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
|
||||
}
|
||||
}
|
||||
t_loop_start = ggml_time_us();
|
||||
for (; ibatch < nbatches; ++ibatch) {
|
||||
ggml_opt_alloc(opt_ctx, /*backward =*/ false);
|
||||
ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
|
||||
ggml_opt_eval(opt_ctx, result_eval);
|
||||
ggml_opt_forward(opt_ctx, result_eval);
|
||||
if (callback_eval) {
|
||||
callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
|
||||
}
|
||||
@@ -893,26 +726,13 @@ void ggml_opt_epoch_callback_progress_bar(
|
||||
int64_t t_start_us) {
|
||||
fprintf(stderr, "%s[", train ? "train: " : "val: ");
|
||||
|
||||
// The progress bar consists of partially filled blocks, unicode has 8 separate fill levels.
|
||||
constexpr int64_t bar_length = 8;
|
||||
const int64_t ibatch8 = 8 * ibatch;
|
||||
constexpr int64_t bar_length = 25;
|
||||
for (int64_t j = 0; j < bar_length; ++j) {
|
||||
if (ibatch_max * (8*j + 8) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u2588"); // full block
|
||||
} else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u2589"); // 7/8 filled
|
||||
} else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u258A"); // 6/8 filled
|
||||
} else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u258B"); // 5/8 filled
|
||||
} else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u258C"); // 4/8 filled
|
||||
} else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u258D"); // 3/8 filled
|
||||
} else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u258E"); // 2/8 filled
|
||||
} else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) {
|
||||
fprintf(stderr, "\u258F"); // 1/8 filled
|
||||
const int64_t ibatch_j = ibatch_max * j/bar_length;
|
||||
if (ibatch_j < ibatch) {
|
||||
fprintf(stderr, "=");
|
||||
} else if (ibatch_max * (j - 1)/bar_length < ibatch) {
|
||||
fprintf(stderr, ">");
|
||||
} else {
|
||||
fprintf(stderr, " ");
|
||||
}
|
||||
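// Worked example for the 8-level bar above: with bar_length = 8 and ibatch/ibatch_max = 0.3,
// the bar covers 0.3*8 = 2.4 blocks, so blocks 0 and 1 print U+2588 (full), block 2 prints
// U+258D (3/8 filled, the 0.4 fractional block mapping to level 3 of 8), and the rest stay empty.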
@@ -944,8 +764,8 @@ void ggml_opt_epoch_callback_progress_bar(
|
||||
const int64_t t_eta_m = t_eta_s / 60;
|
||||
t_eta_s -= t_eta_m * 60;
|
||||
|
||||
fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% "
|
||||
"t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r",
|
||||
fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
|
||||
"t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
|
||||
idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
|
||||
t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
|
||||
if (ibatch == ibatch_max) {
|
||||
@@ -986,10 +806,7 @@ void ggml_opt_fit(
|
||||
|
||||
int64_t epoch = 1;
|
||||
|
||||
ggml_opt_params params = ggml_opt_default_params(backend_sched, loss_type);
|
||||
params.ctx_compute = ctx_compute;
|
||||
params.inputs = inputs;
|
||||
params.outputs = outputs;
|
||||
ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
|
||||
params.opt_period = opt_period;
|
||||
params.get_opt_pars = get_opt_pars;
|
||||
params.get_opt_pars_ud = &epoch;
|
||||
|
||||
@@ -151,12 +151,6 @@ struct rpc_msg_buffer_clear_req {
|
||||
uint8_t value;
|
||||
};
|
||||
|
||||
struct rpc_msg_set_tensor_hash_req {
|
||||
rpc_tensor tensor;
|
||||
uint64_t offset;
|
||||
uint64_t hash;
|
||||
};
|
||||
|
||||
struct rpc_msg_set_tensor_hash_rsp {
|
||||
uint8_t result;
|
||||
};
|
||||
@@ -554,12 +548,15 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_tensor rpc_tensor = serialize_tensor(tensor);
|
||||
if (size > HASH_THRESHOLD) {
|
||||
rpc_msg_set_tensor_hash_req request;
|
||||
request.tensor = rpc_tensor;
|
||||
request.offset = offset;
|
||||
request.hash = fnv_hash((const uint8_t*)data, size);
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
|
||||
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
|
||||
std::vector<uint8_t> input(input_size, 0);
|
||||
uint64_t hash = fnv_hash((const uint8_t*)data, size);
|
||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
if (response.result) {
|
||||
// the server has the same data, no need to send it
|
||||
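// The deduplication above keys on fnv_hash over the raw tensor bytes. For reference, a standard
// 64-bit FNV-1a hash has this shape (sketch; the helper in this file is assumed to be equivalent):
//
//     static uint64_t fnv_hash(const uint8_t * data, size_t len) {
//         uint64_t hash = 0xcbf29ce484222325ULL;   // FNV-1a offset basis
//         for (size_t i = 0; i < len; ++i) {
//             hash ^= data[i];
//             hash *= 0x100000001b3ULL;            // FNV-1a prime
//         }
//         return hash;
//     }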
@@ -867,7 +864,7 @@ public:
|
||||
bool free_buffer(const rpc_msg_free_buffer_req & request);
|
||||
bool buffer_clear(const rpc_msg_buffer_clear_req & request);
|
||||
bool set_tensor(const std::vector<uint8_t> & input);
|
||||
bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
|
||||
bool set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response);
|
||||
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
|
||||
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
|
||||
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
|
||||
@@ -1104,10 +1101,18 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
|
||||
bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response)
|
||||
{
|
||||
// serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
|
||||
if (input.size() != sizeof(rpc_tensor) + 16) {
|
||||
return false;
|
||||
}
|
||||
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
||||
uint64_t offset;
|
||||
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
||||
const uint64_t * hash = (const uint64_t *)(input.data() + sizeof(rpc_tensor) + sizeof(offset));
|
||||
std::vector<uint8_t> cached_file;
|
||||
if (!get_cached_file(request.hash, cached_file)) {
|
||||
if (!get_cached_file(*hash, cached_file)) {
|
||||
response.result = 0;
|
||||
return true;
|
||||
}
|
||||
@@ -1120,28 +1125,25 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
|
||||
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
||||
if (tensor == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
|
||||
__func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size, *hash);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||
|
||||
if (request.tensor.data + request.offset < p0
|
||||
|| request.tensor.data + request.offset >= p1
|
||||
|| size > (p1 - request.tensor.data - request.offset)) {
|
||||
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||
GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
|
||||
__func__, request.tensor.data, request.offset, size, request.hash, p0, p1);
|
||||
__func__, in_tensor->data, offset, size, *hash, p0, p1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, cached_file.data(), request.offset, size);
|
||||
ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
|
||||
response.result = 1;
|
||||
return true;
|
||||
}
|
||||
@@ -1501,12 +1503,12 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
break;
|
||||
}
|
||||
case RPC_CMD_SET_TENSOR_HASH: {
|
||||
rpc_msg_set_tensor_hash_req request;
|
||||
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
||||
std::vector<uint8_t> input;
|
||||
if (!recv_msg(sockfd, input)) {
|
||||
return;
|
||||
}
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
if (!server.set_tensor_hash(request, response)) {
|
||||
if (!server.set_tensor_hash(input, response)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, &response, sizeof(response))) {
|
||||
|
||||
@@ -49,38 +49,35 @@ endif()
|
||||
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
|
||||
|
||||
# Link against oneDNN
|
||||
find_package(DNNL)
|
||||
set(GGML_SYCL_DNNL 0)
|
||||
if(GGML_SYCL_DNN)
|
||||
find_package(DNNL)
|
||||
if(DNNL_FOUND)
|
||||
if (NOT DEFINED DNNL_GPU_VENDOR)
|
||||
# default to intel target
|
||||
set(DNNL_GPU_VENDOR "INTEL")
|
||||
if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
|
||||
message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
|
||||
endif()
|
||||
if(DNNL_FOUND)
|
||||
if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
|
||||
# Assuming oneDNN packaged with oneapi release is used which
|
||||
# supports only intel target
|
||||
set(DNNL_GPU_VENDOR "INTEL")
|
||||
if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
|
||||
message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Verify oneDNN was compiled for the same target as llama
|
||||
if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
|
||||
target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
|
||||
set(GGML_SYCL_DNNL 1)
|
||||
get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
|
||||
foreach(CONFIG ${CONFIGS})
|
||||
get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
|
||||
message(STATUS "Found oneDNN: ${DNNL_LIB}")
|
||||
endforeach()
|
||||
else()
|
||||
message(WARNING
|
||||
"oneDNN must be compiled for the same target as llama.cpp.
|
||||
llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
|
||||
Disabling oneDNN support.")
|
||||
endif()
|
||||
# Verify oneDNN was compiled for the same target as llama
|
||||
if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
|
||||
target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
|
||||
set(GGML_SYCL_DNNL 1)
|
||||
get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
|
||||
foreach(CONFIG ${CONFIGS})
|
||||
get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
|
||||
message(STATUS "Found oneDNN: ${DNNL_LIB}")
|
||||
endforeach()
|
||||
else()
|
||||
message(STATUS "oneDNN not found, disabling oneDNN support")
|
||||
message(WARNING
|
||||
"oneDNN must be compiled for the same target as llama.cpp.
|
||||
llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
|
||||
Disabling oneDNN support.")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "oneDNN support disabled by the user")
|
||||
message(STATUS "oneDNN not found, disabling oneDNN support")
|
||||
endif()
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
|
||||
|
||||
@@ -111,9 +108,6 @@ endif()
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
|
||||
# See https://github.com/uxlfoundation/oneMath/issues/654
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
set(SYCL_COMPILER ON)
|
||||
endif()
|
||||
find_package(MKL REQUIRED)
|
||||
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
|
||||
@@ -14,24 +14,23 @@
|
||||
#define GGML_SYCL_BACKEND_HPP
|
||||
|
||||
#include "binbcast.hpp"
|
||||
#include "common.hpp"
|
||||
#include "concat.hpp"
|
||||
#include "common.hpp"
|
||||
#include "conv.hpp"
|
||||
#include "convert.hpp"
|
||||
#include "cpy.hpp"
|
||||
#include "dequantize.hpp"
|
||||
#include "dmmv.hpp"
|
||||
#include "element_wise.hpp"
|
||||
#include "gla.hpp"
|
||||
#include "im2col.hpp"
|
||||
#include "mmq.hpp"
|
||||
#include "mmvq.hpp"
|
||||
#include "norm.hpp"
|
||||
#include "outprod.hpp"
|
||||
#include "quants.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "norm.hpp"
|
||||
#include "softmax.hpp"
|
||||
#include "tsembd.hpp"
|
||||
#include "im2col.hpp"
|
||||
#include "wkv.hpp"
|
||||
#include "outprod.hpp"
|
||||
#include "element_wise.hpp"
|
||||
#include "cpy.hpp"
|
||||
#include "gla.hpp"
|
||||
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
|
||||
@@ -1,74 +1,93 @@
|
||||
#include "binbcast.hpp"
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include "dpct/helper.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1,
|
||||
dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) {
|
||||
auto element_id = it.get_global_id(0);
|
||||
auto global_range = it.get_global_range(0);
|
||||
for (; element_id < num_elements; element_id += global_range) {
|
||||
auto src0_float_val = sycl::vec(src0[element_id]).template convert<float, sycl::rounding_mode::rte>();
|
||||
auto src1_float_val = sycl::vec(src1[element_id]).template convert<float, sycl::rounding_mode::rte>();
|
||||
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
|
||||
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
|
||||
dst[element_id] = val_to_store;
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) /
|
||||
ne3;
|
||||
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) %
|
||||
ne3;
|
||||
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
for (int i0 = i0s; i0 < ne0;
|
||||
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13,
|
||||
int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10,
|
||||
int s11, int s12, int s13, std::size_t num_dst_elements,
|
||||
const sycl::nd_item<1> & item_ct1) {
|
||||
auto calculate_logical_index =
|
||||
[](const std::array<int, 4> & dims, std::size_t element_id) __attribute__((always_inline))->std::array<int, 4> {
|
||||
std::array<int, 4> logical_index;
|
||||
#pragma unroll(4)
|
||||
for (int i = 3; i >= 0; i--) {
|
||||
logical_index[i] = element_id % dims[i];
|
||||
element_id /= dims[i];
|
||||
}
|
||||
return logical_index;
|
||||
};
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
auto calculate_index = [](const std::array<int, 4> & dims, const std::array<int, 4> & strides,
|
||||
const std::array<int, 4> & indices) __attribute__((always_inline))
|
||||
->std::size_t {
|
||||
std::size_t index = 0;
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < 4; i++) {
|
||||
auto index_i = indices[i];
|
||||
if (indices[i] >= dims[i]) {
|
||||
index_i = indices[i] % dims[i];
|
||||
}
|
||||
index += strides[i] * index_i;
|
||||
}
|
||||
return index;
|
||||
};
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
auto element_id = item_ct1.get_global_id(0);
|
||||
for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) {
|
||||
auto logical_index = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id);
|
||||
auto src_0_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index);
|
||||
auto src_1_index = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index);
|
||||
auto dst_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index);
|
||||
auto src0_float_val = sycl::vec(src0[src_0_index]).template convert<float, sycl::rounding_mode::rte>();
|
||||
auto src1_float_val = sycl::vec(src1[src_1_index]).template convert<float, sycl::rounding_mode::rte>();
|
||||
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
|
||||
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
|
||||
dst[dst_index] = val_to_store;
|
||||
const int i3 = i/(ne2*ne1*ne0);
|
||||
const int i2 = (i/(ne1*ne0)) % ne2;
|
||||
const int i1 = (i/ne0) % ne1;
|
||||
const int i0 = i % ne0;
|
||||
|
||||
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
|
||||
template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
|
||||
|
||||
template<float (*bin_op)(const float, const float)>
|
||||
struct bin_bcast_sycl {
|
||||
template <typename src0_t, typename src1_t, typename dst_t>
|
||||
void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
|
||||
const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
|
||||
@@ -77,73 +96,165 @@ template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
|
||||
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
|
||||
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
|
||||
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
|
||||
auto check_bcast_required = [](const std::array<int64_t, 4> & src_dims,
|
||||
const std::array<int64_t, 4> & dst_dims) -> bool {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (dst_dims[i] > src_dims[i]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
int nr0 = ne10 / ne0;
|
||||
int nr1 = ne11/ne1;
|
||||
int nr2 = ne12/ne2;
|
||||
int nr3 = ne13/ne3;
|
||||
|
||||
int nr[4] = { nr0, nr1, nr2, nr3 };
|
||||
|
||||
// collapse dimensions until first broadcast dimension
|
||||
int64_t cne[] = {ne0, ne1, ne2, ne3};
|
||||
int64_t cne0[] = {ne00, ne01, ne02, ne03};
|
||||
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
||||
size_t cnb[] = {nb0, nb1, nb2, nb3};
|
||||
size_t cnb0[] = {nb00, nb01, nb02, nb03};
|
||||
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
||||
auto collapse = [](int64_t cne[]) {
|
||||
cne[0] *= cne[1];
|
||||
cne[1] = cne[2];
|
||||
cne[2] = cne[3];
|
||||
cne[3] = 1;
|
||||
};
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
||||
cnb[1] *= cne[1];
|
||||
cnb[2] *= cne[2];
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb, cne);
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
}
|
||||
}
|
||||
}
|
||||
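// Worked example for the collapse above (illustrative shapes): with dst (ne0,ne1,ne2,ne3) = (4,3,2,2)
// and src1 broadcast only along dim 2 (nr = {1,1,2,1}), the loop merges dims 0 and 1 before the first
// broadcast dimension: cne {4,3,2,2} -> {12,2,2,1}, with the byte strides rescaled by collapse_nb,
// so the kernel then iterates over fewer, longer contiguous rows.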
{
|
||||
int64_t ne0 = cne[0];
|
||||
int64_t ne1 = cne[1];
|
||||
int64_t ne2 = cne[2];
|
||||
int64_t ne3 = cne[3];
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
int64_t ne10 = cne1[0];
|
||||
int64_t ne11 = cne1[1];
|
||||
int64_t ne12 = cne1[2];
|
||||
int64_t ne13 = cne1[3];
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
size_t nb0 = cnb[0];
|
||||
size_t nb1 = cnb[1];
|
||||
size_t nb2 = cnb[2];
|
||||
size_t nb3 = cnb[3];
|
||||
|
||||
// dst strides in number of elements
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
size_t nb00 = cnb0[0];
|
||||
size_t nb01 = cnb0[1];
|
||||
size_t nb02 = cnb0[2];
|
||||
size_t nb03 = cnb0[3];
|
||||
|
||||
// src1 strides in number of elements
|
||||
size_t s10 = nb10 / sizeof(src0_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
size_t nb10 = cnb1[0];
|
||||
size_t nb11 = cnb1[1];
|
||||
size_t nb12 = cnb1[2];
|
||||
size_t nb13 = cnb1[3];
|
||||
|
||||
// src0 strides in number of elements
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
|
||||
std::size_t num_dst_elements = static_cast<std::size_t>(ne0) * static_cast<std::size_t>(ne1) *
|
||||
static_cast<std::size_t>(ne2) * static_cast<std::size_t>(ne3);
|
||||
std::size_t local_range = 256;
|
||||
std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range;
|
||||
size_t s10 = nb10 / sizeof(src1_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
|
||||
bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) ||
|
||||
check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 });
|
||||
bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous;
|
||||
size_t s00 = nb00 / sizeof(src0_t);
|
||||
size_t s01 = nb01 / sizeof(src0_t);
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
if (! needs_broadcasting && all_contiguous) {
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
|
||||
k_bin_bcast_contiguous<bin_op>(src0_dd, src1_dd, dst_dd, num_dst_elements, it);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1,
|
||||
s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it);
|
||||
});
|
||||
});
|
||||
GGML_UNUSED(s00);
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
|
||||
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
|
||||
|
||||
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
||||
|
||||
sycl::range<3> block_dims(1, 1, 1);
|
||||
block_dims[2] = std::min<unsigned int>(hne0, block_size);
|
||||
block_dims[1] = std::min<unsigned int>(
|
||||
ne1, block_size / (unsigned int)block_dims[2]);
|
||||
block_dims[0] = std::min(
|
||||
std::min<unsigned int>(
|
||||
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
|
||||
(unsigned int)block_dims[1]),
|
||||
64U);
|
||||
|
||||
sycl::range<3> block_nums(
|
||||
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
|
||||
(ne1 + block_dims[1] - 1) / block_dims[1],
|
||||
(hne0 + block_dims[2] - 1) / block_dims[2]);
|
||||
|
||||
if (block_nums[0] > 65535) {
|
||||
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
||||
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
|
||||
sycl::range<3>(1, 1, block_size),
|
||||
sycl::range<3>(1, 1, block_size)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
|
||||
s03, s11, s12, s13, item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:16: The work-group size passed to the SYCL kernel may
|
||||
exceed the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if
|
||||
needed.
|
||||
*/
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||