Compare commits


1 Commit

Author: Jared Van Bortel
SHA1: b06a954bbc
Message: llama_encode : only force non-causal attention for enc-dec models
Date: 2025-05-19 13:43:59 -04:00
139 changed files with 3627 additions and 103719 deletions

View File

@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.0.1
ARG MUSA_VERSION=rc3.1.1
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
@@ -21,14 +21,21 @@ RUN apt-get update && \
libcurl4-openssl-dev \
libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -48,7 +48,3 @@ end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/mtmd/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset

View File

@@ -351,7 +351,7 @@ jobs:
ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
steps:
- name: Clone

View File

@@ -1,4 +1,4 @@
name: Release
name: Create Release
on:
workflow_dispatch: # allows manual triggering
@@ -227,69 +227,6 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
windows-cpu:
runs-on: windows-latest
strategy:
matrix:
include:
- arch: 'x64'
- arch: 'arm64'
steps:
- name: Clone
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-cpu-${{ matrix.arch }}
variant: ccache
evict-old-files: 1d
- name: Install Ninja
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
-DGGML_OPENMP=ON ^
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
${{ env.CMAKE_ARGS }}
cmake --build build --config Release
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-cpu-${{ matrix.arch }}.zip
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
windows:
runs-on: windows-latest
@@ -300,30 +237,52 @@ jobs:
strategy:
matrix:
include:
- backend: 'vulkan'
- build: 'cpu-x64'
arch: 'x64'
defines: '-DGGML_VULKAN=ON'
target: 'ggml-vulkan'
- backend: 'opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
#- build: 'openblas-x64'
# arch: 'x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'vulkan-x64'
arch: 'x64'
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
- build: 'cpu-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
- build: 'opencl-adreno-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
target: 'ggml-opencl'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
key: windows-latest-cmake-${{ matrix.build }}
variant: ccache
evict-old-files: 1d
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
mkdir $env:RUNNER_TEMP/openblas
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.backend == 'vulkan' }}
if: ${{ matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@@ -337,7 +296,7 @@ jobs:
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
if: ${{ matrix.build == 'opencl-adreno-arm64' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
@@ -355,22 +314,46 @@ jobs:
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
cmake --build build --config Release --target ${{ matrix.target }}
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip
windows-cuda:
runs-on: windows-2019
@@ -383,6 +366,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -401,30 +386,45 @@ jobs:
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DGGML_BACKEND_DL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_CPU=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=ON ^
-DGGML_CUDA=ON ^
-DLLAMA_CURL=OFF
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
${{ env.CMAKE_ARGS }}
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
run: |
@@ -432,13 +432,13 @@ jobs:
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
uses: actions/upload-artifact@v4
with:
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
windows-sycl:
runs-on: windows-latest
@@ -451,11 +451,12 @@ jobs:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -468,18 +469,15 @@ jobs:
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
shell: cmd
run: |
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
cmake -G "Ninja" -B build ^
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
-DLLAMA_CURL=OFF
cmake --build build --target ggml-sycl -j
run: examples/sycl/win-build-sycl.bat
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Build the release package
id: pack_artifacts
@@ -504,12 +502,12 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-bin-win-sycl-x64.zip ./build/bin/*
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-sycl-x64.zip
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
windows-hip:
@@ -517,14 +515,14 @@ jobs:
strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
gpu_target: [gfx1100, gfx1101, gfx1030]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Clone rocWMMA repository
id: clone_rocwmma
@@ -534,7 +532,7 @@ jobs:
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
key: windows-latest-cmake-hip-release
evict-old-files: 1d
- name: Install
@@ -552,39 +550,50 @@ jobs:
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `
-DGGML_CPU=OFF `
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_CURL=OFF
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
${{ env.CMAKE_ARGS }}
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
ios-xcode-build:
runs-on: macos-latest
@@ -646,16 +655,14 @@ jobs:
runs-on: ubuntu-latest
needs:
- ubuntu-22-cpu
- ubuntu-22-vulkan
- windows
- windows-cpu
- windows-cuda
- windows-sycl
- windows-hip
- ubuntu-22-cpu
- ubuntu-22-vulkan
- macOS-arm64
- macOS-x64
- ios-xcode-build
steps:
- name: Clone
@@ -673,43 +680,10 @@ jobs:
uses: actions/download-artifact@v4
with:
path: ./artifact
merge-multiple: true
- name: Move artifacts
id: move_artifacts
run: |
mkdir -p release
echo "Adding CPU backend files to existing zips..."
for arch in x64 arm64; do
cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
temp_dir=$(mktemp -d)
echo "Extracting CPU backend for $arch..."
unzip "$cpu_zip" -d "$temp_dir"
echo "Adding CPU files to $arch zips..."
for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
if [[ "$target_zip" == "$cpu_zip" ]]; then
continue
fi
echo "Adding CPU backend to $(basename "$target_zip")"
realpath_target_zip=$(realpath "$target_zip")
(cd "$temp_dir" && zip -r "$realpath_target_zip" .)
done
rm -rf "$temp_dir"
done
echo "Renaming and moving zips to release..."
for zip_file in artifact/llama-bin-win-*.zip; do
base_name=$(basename "$zip_file" .zip)
zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
echo "Moving $zip_file to release/$zip_name"
mv "$zip_file" "release/$zip_name"
done
echo "Moving other artifacts..."
mv -v artifact/*.zip release
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
- name: Create release
id: create_release
@@ -728,7 +702,7 @@ jobs:
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./release')) {
for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
@@ -736,7 +710,7 @@ jobs:
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./release/${file}`)
data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}

View File

@@ -1,42 +0,0 @@
name: Update Winget Package
on:
workflow_dispatch: # allows manual triggering
schedule:
- cron: '28 5 * * *' # Update every day at 5:28 UTC
jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
steps:
- name: Install cargo binstall
uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
- name: Install komac
run: |
cargo binstall komac@2.11.2 -y
- name: Find latest release
id: find_latest_release
uses: actions/github-script@v6
with:
script: |
const { data: releases } = await github.rest.repos.listReleases({
owner: context.repo.owner,
repo: context.repo.repo,
});
console.log("Latest release:", releases[0].tag_name);
return releases[0].tag_name;
- name: Update manifest
env:
VERSION: ${{ steps.find_latest_release.outputs.result }}
run: |
echo "Updating manifest..."
komac update --version ${{ env.VERSION }} \
--urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
--token ${{ secrets.WINGET_GITHUB_TOKEN }} \
--submit \
ggml.llamacpp

View File

@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2, AVX512 and AMX support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [BLAS](docs/build.md#blas-build) | All |
| [BLIS](docs/backend/BLIS.md) | All |
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [HIP](docs/build.md#hip) | AMD GPU |
| [Vulkan](docs/build.md#vulkan) | GPU |
@@ -580,4 +580,3 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain

View File

@@ -54,7 +54,7 @@ docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
mthreads/musa:rc3.1.1-devel-ubuntu22.04
```
Inside the container, execute the following commands:

View File

@@ -60,16 +60,12 @@ add_library(${TARGET} STATIC
base64.hpp
chat.cpp
chat.h
chat-parser.cpp
chat-parser.h
common.cpp
common.h
console.cpp
console.h
json-schema-to-grammar.cpp
json.hpp
json-partial.h
json-partial.cpp
llguidance.cpp
log.cpp
log.h

View File

@@ -39,7 +39,7 @@
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_SERVER,
};
@@ -1445,14 +1445,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_keep = value;
}
));
add_opt(common_arg(
{"--swa-full"},
string_format("use full-size SWA cache (default: %s)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
[](common_params & params) {
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1678,7 +1670,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -2065,6 +2057,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.grp_attn_w = value;
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-dkvc", "--dump-kv-cache"},
"verbose print of the KV cache",
[](common_params & params) {
params.dump_kv_cache = true;
}
));
add_opt(common_arg(
{"-nkvo", "--no-kv-offload"},
"disable KV offload",
@@ -2233,12 +2232,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MTMD}));
).set_examples({LLAMA_EXAMPLE_LLAVA}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
@@ -2848,24 +2847,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
"(default: deepseek)",
"reasoning format (default: deepseek; allowed values: deepseek, none)\n"
"controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
"only supported for non-streamed responses",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
else { throw std::invalid_argument("invalid value"); }
else { std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
[](common_params & params, int value) {
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@@ -2877,7 +2867,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
@@ -2964,7 +2954,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
else if (value == "md") { params.batched_bench_output_jsonl = false; }
else { throw std::invalid_argument("invalid value"); }
else { std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_BENCH}));
add_opt(common_arg(

View File

@@ -1,376 +0,0 @@
#include "chat-parser.h"
#include "common.h"
#include "log.h"
#include "regex-partial.h"
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
: input_(input), is_partial_(is_partial), syntax_(syntax)
{
result_.role = "assistant";
while (true) {
std::string id = std::to_string(std::rand());
if (input.find(id) == std::string::npos) {
healing_marker_ = id;
break;
}
}
}
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
GGML_ASSERT(rng.begin <= rng.end);
return input_.substr(rng.begin, rng.end - rng.begin);
}
void common_chat_msg_parser::add_content(const std::string &content) {
result_.content += content;
}
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
result_.reasoning_content += reasoning_content;
}
bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
if (name.empty()) {
return false;
}
common_chat_tool_call tool_call;
tool_call.name = name;
tool_call.arguments = arguments;
tool_call.id = id;
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
result_.tool_calls.emplace_back(tool_call);
return true;
}
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
return add_tool_call(name, id, arguments);
}
bool common_chat_msg_parser::add_tool_calls(const json & arr) {
for (const auto & item : arr) {
if (!add_tool_call(item)) {
return false;
}
}
return true;
}
void common_chat_msg_parser::finish() {
if (!is_partial_ && pos_ != input_.size()) {
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
}
}
bool common_chat_msg_parser::consume_spaces() {
const auto length = input_.size();
auto consumed = false;
while (pos_ < length && std::isspace(input_[pos_])) {
++pos_;
consumed = true;
}
return consumed;
}
bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
auto pos = pos_;
for (auto i = 0u; i < literal.size(); ++i) {
if (pos >= input_.size()) {
return false;
}
if (input_[pos] != literal[i]) {
return false;
}
++pos;
}
pos_ = pos;
return true;
}
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
auto idx = input_.find(literal, pos_);
if (idx != std::string::npos) {
find_regex_result res;
res.prelude = input_.substr(pos_, idx - pos_);
auto end = idx + literal.size();
res.groups.emplace_back(common_string_range{idx, end});
move_to(end);
return res;
}
if (is_partial_) {
idx = string_find_partial_stop(input_, literal);
if (idx != std::string::npos && idx >= pos_) {
find_regex_result res;
res.prelude = input_.substr(pos_, idx - pos_);
auto end = input_.size();
res.groups.emplace_back(common_string_range{idx, end});
move_to(end);
return res;
}
}
return std::nullopt;
}
void common_chat_msg_parser::consume_literal(const std::string & literal) {
if (!try_consume_literal(literal)) {
throw common_chat_msg_partial_exception(literal);
}
}
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
return;
}
if (syntax_.reasoning_in_content) {
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
add_content(stripped_reasoning);
if (closed) {
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
add_reasoning_content(stripped_reasoning);
}
};
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
if (auto res = try_find_literal(end_think)) {
handle_reasoning(res->prelude, /* closed */ true);
consume_spaces();
return true;
}
auto rest = consume_rest();
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
if (!syntax_.thinking_forced_open) {
throw common_chat_msg_partial_exception(end_think);
}
return true;
}
}
return false;
}
std::string common_chat_msg_parser::consume_rest() {
auto rest = input_.substr(pos_);
pos_ = input_.size();
return rest;
}
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from) {
auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
return std::nullopt;
}
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
if (is_partial()) {
throw common_chat_msg_partial_exception(regex.str());
}
return std::nullopt;
}
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
pos_ = m.groups[0].end;
return find_regex_result{prelude, m.groups};
}
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
if (auto result = try_consume_regex(regex)) {
return *result;
}
throw common_chat_msg_partial_exception(regex.str());
}
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
auto m = regex.search(input_, pos_);
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
return std::nullopt;
}
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
if (is_partial()) {
throw common_chat_msg_partial_exception(regex.str());
}
return std::nullopt;
}
if (m.groups[0].begin != pos_) {
// Didn't match at the current position.
return std::nullopt;
}
pos_ = m.groups[0].end;
return find_regex_result {
/* .prelude = */ "",
m.groups,
};
}
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
auto it = input_.cbegin() + pos_;
const auto end = input_.cend();
common_json result;
if (!common_json_parse(it, end, healing_marker_, result)) {
return std::nullopt;
}
pos_ = std::distance(input_.cbegin(), it);
if (result.healing_marker.marker.empty()) {
// No healing marker, just return the parsed json
return result;
}
if (!is_partial()) {
throw common_chat_msg_partial_exception("JSON");
}
return result;
}
common_json common_chat_msg_parser::consume_json() {
if (auto result = try_consume_json()) {
return *result;
}
throw common_chat_msg_partial_exception("JSON");
}
common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths,
const std::vector<std::vector<std::string>> & content_paths
) {
if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
return *result;
}
throw common_chat_msg_partial_exception("JSON");
}
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths,
const std::vector<std::vector<std::string>> & content_paths
) {
auto partial = try_consume_json();
if (!partial) {
return std::nullopt;
}
auto is_arguments_path = [&](const std::vector<std::string> & path) {
return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
};
auto is_content_path = [&](const std::vector<std::string> & path) {
return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
};
if (partial->healing_marker.marker.empty()) {
if (args_paths.empty()) {
// No arguments to dump, and JSON was parsed fully.
return consume_json_result {
partial->json,
/* .is_partial = */ false,
};
}
if (is_arguments_path({})) {
// Entire JSON is the arguments and was parsed fully.
return consume_json_result {
partial->json.dump(),
/* .is_partial = */ false,
};
}
}
LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
auto found_healing_marker = false;
std::vector<std::string> path;
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
if (is_arguments_path(path)) {
auto arguments = j.dump();
if (is_partial() && !partial->healing_marker.marker.empty()) {
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
if (idx != std::string::npos) {
arguments.resize(idx);
found_healing_marker = true;
}
if (arguments == "\"") {
// This happens because of completing `:"$magic` after `"arguments"`
arguments = "";
}
}
return arguments;
}
if (is_content_path(path)) {
if (!j.is_string()) {
throw std::runtime_error("Content path must be a string");
}
std::string str = j;
auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
if (idx != std::string::npos) {
str.resize(idx);
found_healing_marker = true;
}
return str;
}
if (j.is_object()) {
auto obj = json::object();
for (const auto & p : j.items()) {
const auto & key = p.key();
const auto & value = p.value();
const std::string key_str = key; // NOLINT
auto idx = key_str.find(healing_marker_);
if (idx != std::string::npos) {
found_healing_marker = true;
break;
}
path.push_back(key_str);
if (value.is_string()) {
const std::string value_str = value;
if (value_str.find(healing_marker_) != std::string::npos) {
found_healing_marker = true;
if (is_content_path(path)) {
if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
// The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
obj[key] = remove_unsupported_healings_and_dump_args(value);
}
}
break;
}
obj[key] = value;
} else {
obj[key] = remove_unsupported_healings_and_dump_args(value);
}
path.pop_back();
}
return obj;
}
if (j.is_array()) {
auto arr = json::array();
for (const auto & value : j) {
if (value.is_string()) {
std::string str = value;
auto idx = str.find(healing_marker_);
if (idx != std::string::npos) {
// Don't heal array values that aren't in the arguments.
found_healing_marker = true;
break;
}
}
arr.push_back(remove_unsupported_healings_and_dump_args(value));
}
return arr;
}
return j;
};
auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
return consume_json_result {
cleaned,
/* .is_partial = */ found_healing_marker,
};
}

View File

@@ -1,116 +0,0 @@
#pragma once
#include "chat.h"
#include "json-partial.h"
#include "json.hpp"
#include "regex-partial.h"
#include <optional>
#include <string>
#include <vector>
class common_chat_msg_partial_exception : public std::runtime_error {
public:
common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};
class common_chat_msg_parser {
std::string input_;
bool is_partial_;
common_chat_syntax syntax_;
std::string healing_marker_;
size_t pos_ = 0;
common_chat_msg result_;
public:
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
const std::string & input() const { return input_; }
size_t pos() const { return pos_; }
const std::string & healing_marker() const { return healing_marker_; }
const bool & is_partial() const { return is_partial_; }
const common_chat_msg & result() const { return result_; }
void move_to(size_t pos) {
if (pos > input_.size()) {
throw std::runtime_error("Invalid position!");
}
pos_ = pos;
}
void move_back(size_t n) {
if (pos_ < n) {
throw std::runtime_error("Can't move back that far!");
}
pos_ -= n;
}
// Get the substring of the input at the given range
std::string str(const common_string_range & rng) const;
// Appends to the result.content field
void add_content(const std::string & content);
// Appends to the result.reasoning_content field
void add_reasoning_content(const std::string & reasoning_content);
// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
// Adds a tool call using the "name", "id" and "arguments" fields of the json object
bool add_tool_call(const nlohmann::ordered_json & tool_call);
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
bool add_tool_calls(const nlohmann::ordered_json & arr);
void finish();
bool consume_spaces();
void consume_literal(const std::string & literal);
bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
std::string consume_rest();
struct find_regex_result {
std::string prelude;
std::vector<common_string_range> groups;
};
std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos);
bool try_consume_literal(const std::string & literal);
std::optional<find_regex_result> try_find_literal(const std::string & literal);
find_regex_result consume_regex(const common_regex & regex);
std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
std::optional<common_json> try_consume_json();
common_json consume_json();
struct consume_json_result {
nlohmann::ordered_json value;
bool is_partial;
};
/*
Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
- with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
- with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
*/
consume_json_result consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths = {},
const std::vector<std::vector<std::string>> & content_paths = {}
);
std::optional<consume_json_result> try_consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths = {},
const std::vector<std::vector<std::string>> & content_paths = {}
);
};
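
The docstring above describes how truncated subtrees are dumped to (possibly truncated) JSON strings. A minimal usage sketch, with a hypothetical input and paths and assuming the class exactly as declared above, might look like:

```cpp
// Hypothetical sketch: parse a partial tool-call payload and dump the
// "arguments" subtree to a (possibly truncated) JSON string.
common_chat_syntax syntax;  // defaults: COMMON_CHAT_FORMAT_CONTENT_ONLY
common_chat_msg_parser parser(
    "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Par",
    /* is_partial */ true, syntax);
auto res = parser.consume_json_with_dumped_args(/* args_paths */ {{"arguments"}});
// res.value is roughly {"name":"get_weather","arguments":"{\"city\": \"Par"}
// res.is_partial is true because a healing marker was found and stripped
```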

File diff suppressed because it is too large.

View File

@@ -3,7 +3,6 @@
#pragma once
#include "common.h"
#include <functional>
#include <chrono>
#include <string>
#include <vector>
@@ -14,19 +13,11 @@ struct common_chat_tool_call {
std::string name;
std::string arguments;
std::string id;
bool operator==(const common_chat_tool_call & other) const {
return name == other.name && arguments == other.arguments && id == other.id;
}
};
struct common_chat_msg_content_part {
std::string type;
std::string text;
bool operator==(const common_chat_msg_content_part & other) const {
return type == other.type && text == other.text;
}
};
struct common_chat_msg {
@@ -37,51 +28,6 @@ struct common_chat_msg {
std::string reasoning_content;
std::string tool_name;
std::string tool_call_id;
template <class T> T to_json_oaicompat() const;
bool empty() const {
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
}
void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
for (auto i = 0u; i < tool_calls.size(); i++) {
if (ids_cache.size() <= i) {
auto id = tool_calls[i].id;
if (id.empty()) {
id = gen_tool_call_id();
}
ids_cache.push_back(id);
}
tool_calls[i].id = ids_cache[i];
}
}
bool operator==(const common_chat_msg & other) const {
return role == other.role
&& content == other.content
&& content_parts == other.content_parts
&& tool_calls == other.tool_calls
&& reasoning_content == other.reasoning_content
&& tool_name == other.tool_name
&& tool_call_id == other.tool_call_id;
}
bool operator!=(const common_chat_msg & other) const {
return !(*this == other);
}
};
struct common_chat_msg_diff {
// std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
bool operator==(const common_chat_msg_diff & other) const {
return content_delta == other.content_delta
&& tool_call_index == other.tool_call_index
&& tool_call_delta == other.tool_call_delta;
}
};
struct common_chat_tool {
@@ -103,11 +49,14 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_LLAMA_3_X,
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@@ -122,8 +71,7 @@ struct common_chat_templates_inputs {
std::vector<common_chat_tool> tools;
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool parallel_tool_calls = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = true;
bool extract_reasoning = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};
@@ -132,20 +80,11 @@ struct common_chat_params {
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
bool thinking_forced_open = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
};
struct common_chat_syntax {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
@@ -182,9 +121,8 @@ std::string common_chat_format_example(
const struct common_chat_templates * tmpls,
bool use_jinja);
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
std::string common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -197,5 +135,3 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);

View File

@@ -849,7 +849,7 @@ std::string fs_get_cache_directory() {
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
@@ -1102,9 +1102,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
}
mparams.progress_callback = params.load_progress_callback;
mparams.progress_callback_user_data = params.load_progress_callback_user_data;
return mparams;
}
@@ -1136,7 +1133,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
if (params.reranking) {
cparams.embeddings = true;
@@ -1329,6 +1325,81 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
return text;
}
//
// KV cache utils
//
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
int seq_count = 0;
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) { seq_count++; }
}
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
}
printf("\n=== Done dumping\n");
}
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
std::unordered_map<llama_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] < 0) { continue; }
if (seqs.find(cs_curr[j]) == seqs.end()) {
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
const size_t sz = seqs.size();
seqs[cs_curr[j]] = sz;
}
}
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
}
printf("=== Sequence legend: ");
for (const auto & it : seqs) {
printf("%zu=%d, ", it.second, it.first);
}
printf("'+'=other sequence ids");
c_curr = view.cells;
cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) {
const auto & it = seqs.find(cs_curr[j]);
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
} else {
putchar('.');
}
}
putchar(' ');
}
printf("\n=== Done dumping\n");
}
//
// Embedding utils
//

View File

@@ -76,7 +76,7 @@ enum llama_example {
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
@@ -115,7 +115,7 @@ enum common_grammar_trigger_type {
COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
};
struct common_grammar_trigger {
@@ -323,13 +323,13 @@ struct common_params {
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
@@ -368,7 +368,6 @@ struct common_params {
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
std::vector<std::string> api_keys;
@@ -429,11 +428,6 @@ struct common_params {
// common params
std::string out_file; // output filename for all example programs
// optional callback for model loading progress and cancellation:
// called with a progress value between 0.0 and 1.0.
// return false from callback to abort model loading or true to continue
llama_progress_callback load_progress_callback = NULL;
void * load_progress_callback_user_data = NULL;
};
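
For context, the load-progress hook removed above follows the llama_progress_callback convention from llama.h (a `bool (*)(float progress, void * user_data)` that returns false to abort). A hedged, illustrative sketch of how a caller could have used it:

```cpp
// Illustrative only; assumes the llama_progress_callback signature from llama.h.
common_params params;
params.load_progress_callback = [](float progress, void * /* user_data */) -> bool {
    fprintf(stderr, "\rloading model: %3.0f%%", progress * 100.0f);
    return true;  // returning false aborts model loading
};
params.load_progress_callback_user_data = nullptr;
```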
// call once at the start of a program if it uses libcommon
@@ -622,6 +616,16 @@ std::string common_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);
//
// KV cache utils
//
// Dump the KV cache view with the number of sequences per cell.
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
//
// Embedding utils
//
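
A hedged usage sketch for the KV cache dump helpers re-declared above, assuming the llama_kv_cache_view API from llama.h of the same vintage (the view init/update/free names may differ between llama.cpp versions):

```cpp
// Illustrative only: build a view of the KV cache and print it.
// ctx is an already-initialized llama_context *.
llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /* n_seq_max */ 4);
llama_kv_cache_view_update(ctx, &view);
common_kv_cache_dump_view_seqs(view, /* row_size */ 40);
llama_kv_cache_view_free(&view);
```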

View File

@@ -1,255 +0,0 @@
#include <json-partial.h>
#include "ggml.h"
#include "log.h"
#include <string>
#include <json.hpp>
using json = nlohmann::ordered_json;
enum common_json_stack_element_type {
COMMON_JSON_STACK_ELEMENT_OBJECT,
COMMON_JSON_STACK_ELEMENT_KEY,
COMMON_JSON_STACK_ELEMENT_ARRAY,
};
struct common_json_stack_element {
common_json_stack_element_type type;
std::string key;
};
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out)
{
std::string::const_iterator it = input.begin();
const auto end = input.end();
return common_json_parse(it, end, healing_marker, out);
}
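
A hedged example of the healing behaviour this file implements; the marker string is arbitrary and supplied by the caller:

```cpp
// Illustrative: a JSON payload truncated inside a string value is closed with
// the healing marker so it still parses, and the marker position is recorded
// in out.healing_marker so callers can later truncate dumped strings there.
common_json out;
bool ok = common_json_parse("{\"foo\": \"bar\", \"baz\": \"b", "$magic$", out);
// ok == true; out.json is roughly {"foo":"bar","baz":"b$magic$"}
// out.healing_marker.marker == "$magic$"
```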
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out)
{
// // https://json.nlohmann.me/features/parsing/sax_interface/
struct json_error_locator : public nlohmann::json_sax<json> {
std::size_t position;
bool found_error;
std::string last_token;
std::string exception_message;
std::vector<common_json_stack_element> stack;
json_error_locator() : position(0), found_error(false) {}
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
this->position = position - 1;
this->found_error = true;
this->last_token = last_token;
this->exception_message = ex.what();
return false;
}
void close_value() {
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
stack.pop_back();
}
}
bool null() override { // NOLINT
close_value();
return true;
}
bool boolean(bool) override { // NOLINT
close_value();
return true;
}
bool number_integer(number_integer_t) override { // NOLINT
close_value();
return true;
}
bool number_unsigned(number_unsigned_t) override { // NOLINT
close_value();
return true;
}
bool number_float(number_float_t, const string_t &) override { // NOLINT
close_value();
return true;
}
bool string(string_t &) override { // NOLINT
close_value();
return true;
}
bool binary(binary_t &) override { // NOLINT
close_value();
return true;
}
bool start_object(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
return true;
}
bool end_object() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
stack.pop_back();
close_value();
return true;
}
bool key(string_t & key) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
return true;
}
bool start_array(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
return true;
}
bool end_array() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
stack.pop_back();
close_value();
return true;
}
};
json_error_locator err_loc;
auto start = it;
json::sax_parse(it, end, &err_loc);
if (err_loc.found_error) {
it = start;
auto temptative_end = it + err_loc.position;
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
auto input = std::string(it, temptative_end);
try {
out.json = json::parse(input);
// out.json = json::parse(it, temptative_end);
it = temptative_end;
return true;
} catch (const std::exception & ex) {
// No, needs healing.
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
}
auto can_parse = [](const std::string & str) {
try {
auto _ = json::parse(str); // NOLINT
return true;
} catch (const std::exception &) {
return false;
}
};
if (!healing_marker.empty() && !err_loc.stack.empty()) {
std::string str(it, temptative_end);
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
if (last_non_sp_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
auto last_non_sp_char = str[last_non_sp_pos];
// Used to detect stops on a number, which may not be complete.
auto was_maybe_number = [&]() {
if (!str.empty() && std::isspace(str.back())) {
return false;
}
return std::isdigit(last_non_sp_char) ||
last_non_sp_char == '.' ||
last_non_sp_char == 'e' ||
last_non_sp_char == 'E' ||
last_non_sp_char == '-';
};
std::string closing;
for (size_t i = err_loc.stack.size(); i > 0; i--) {
auto & el = err_loc.stack[i - 1];
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
closing += "}";
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
closing += "]";
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
throw std::runtime_error("Unexpected stack element type");
}
}
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
// We're inside an object value
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
// Was about to create an object value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + ": 1" + closing)) {
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
// Was about to create an object
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an object value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an object value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else {
// find last :
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
// Cutting back to opening : for object value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
// Was about to create an array value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an array value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an array value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
// Had just finished a value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
} else {
auto last_pos = str.find_last_of("[,");
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
}
// Cutting back to last [ or , for array value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\": 1" + closing)) {
// Was inside an object key string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
// Was inside an object key string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
} else {
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "Cutting back to last : for object key+value\n");
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
out.json = json::parse(str);
it = temptative_end;
return true;
}
// TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
// fprintf(stderr, "Closing: TODO\n");
return false;
}
out.json = json::parse(it, end);
it = end;
return true;
}
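For illustration only (not part of this change), a minimal sketch of what the healing above does to an output truncated mid-way through an object value string; the include path and marker value are assumptions:

```cpp
#include <cstdio>
#include <string>

#include "json-partial.h" // assumed path for the header declaring common_json / common_json_parse

int main() {
    common_json out;
    std::string partial = "{\"code\": \"print(";   // truncated inside a string value
    if (common_json_parse(partial, /* healing_marker = */ "$marker$", out)) {
        // healed to a fully parseable {"code":"print($marker$"}; json_dump_marker
        // records the fragment that was spliced in so it can be cut out again
        printf("healed     : %s\n", out.json.dump().c_str());
        printf("dump marker: %s\n", out.healing_marker.json_dump_marker.c_str());
    }
    return 0;
}
```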

View File

@@ -1,37 +0,0 @@
#pragma once
#include <json.hpp>
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
struct common_healing_marker {
// Raw marker.
std::string marker;
// Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo whitespace, assuming the same dump format).
std::string json_dump_marker;
};
// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
struct common_json {
nlohmann::ordered_json json;
common_healing_marker healing_marker;
};
// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
//
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
// This makes it possible to parse the resulting healed JSON string, yet still cut it again at the healing marker if needed.
// (this is used when parsing JSON output from models, then crafting partial JSON for the partial tool calls in OAI format).
//
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out);
// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out);
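A hedged sketch of the round trip described above, as used for streaming partial tool calls: heal the truncated string, dump it, then cut the dump at `json_dump_marker` to recover a re-serialized partial prefix. The helper name and include path are illustrative assumptions:

```cpp
#include <string>

#include "json-partial.h" // assumed path for the header above

// Return a re-serialized prefix of a possibly truncated JSON string.
static std::string partial_dump(const std::string & partial) {
    common_json cj;
    if (!common_json_parse(partial, "$llama.cpp.json$", cj)) {
        return ""; // not even healable
    }
    std::string dumped = cj.json.dump();
    const auto & marker = cj.healing_marker.json_dump_marker;
    if (marker.empty()) {
        return dumped; // input was already complete JSON, nothing was healed
    }
    // cut at the (only) occurrence of the marker to drop the healed suffix
    return dumped.substr(0, dumped.find(marker));
}
```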

View File

@@ -161,7 +161,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
std::vector<std::string> trigger_patterns;
std::vector<std::string> patterns_at_start;
std::vector<std::string> patterns_anywhere;
std::vector<llama_token> trigger_tokens;
for (const auto & trigger : params.grammar_triggers) {
@@ -173,13 +173,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
{
patterns_anywhere.push_back(trigger.value);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
{
trigger_patterns.push_back(trigger.value);
const auto & pattern = trigger.value;
(trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -193,6 +190,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
}
}
std::vector<std::string> trigger_patterns;
if (!patterns_at_start.empty()) {
trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
}
if (!patterns_anywhere.empty()) {
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
}
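As a standalone sketch (not in the patch), the behavioral difference the two combined regexes encode: `PATTERN_START` triggers must match from the first character of the output, while plain `PATTERN` triggers may match anywhere. The inner trigger strings below are made up for illustration:

```cpp
#include <cstdio>
#include <regex>
#include <string>

int main() {
    // same shapes as the construction above, with hypothetical inner patterns
    const std::regex at_start("^(<tool_call>)[\\s\\S]*");
    const std::regex anywhere("^[\\s\\S]*?(\\{\"name\":)[\\s\\S]*");

    printf("%d\n", (int) std::regex_match("<tool_call>{\"name\": \"f\"}", at_start)); // 1
    printf("%d\n", (int) std::regex_match("text {\"name\": \"f\"}",       at_start)); // 0
    printf("%d\n", (int) std::regex_match("text {\"name\": \"f\"}",       anywhere)); // 1
    return 0;
}
```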

View File

@@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum):
class ModelType(IntEnum):
TEXT = 1
MMPROJ = 2
VISION = 2
AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
@@ -54,7 +54,7 @@ AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
class ModelBase:
_model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
ModelType.TEXT: {},
ModelType.MMPROJ: {},
ModelType.VISION: {},
}
dir_model: Path
@@ -88,7 +88,7 @@ class ModelBase:
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
if type(self) is ModelBase or \
type(self) is TextModel or \
type(self) is MmprojModel:
type(self) is VisionModel:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
@@ -309,7 +309,6 @@ class ModelBase:
gguf.MODEL_TENSOR.POSNET_NORM1,
gguf.MODEL_TENSOR.POSNET_NORM2,
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
)
)
or not new_name.endswith(".weight")
@@ -439,7 +438,7 @@ class ModelBase:
assert names
def func(modelcls: AnyModel) -> AnyModel:
model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT
for name in names:
cls._model_classes[model_type][name] = modelcls
return modelcls
@@ -1115,87 +1114,60 @@ class TextModel(ModelBase):
self.gguf_writer.add_pooling_type(pooling_type)
class MmprojModel(ModelBase):
model_type = ModelType.MMPROJ
model_arch = gguf.MODEL_ARCH.MMPROJ
class VisionModel(ModelBase):
model_type = ModelType.VISION
model_arch = gguf.MODEL_ARCH.CLIP_VISION
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]
has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
if self.has_vision_encoder and self.has_audio_encoder:
raise NotImplementedError("both vision + audio not supported yet")
if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
# get n_embd of the text model
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
if "audio_config" not in self.hparams:
self.hparams["audio_config"] = {}
text_config = {**self.hparams, **self.hparams["text_config"]}
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
assert self.n_embd_text > 0, "n_embd not found in hparams"
if "vision_config" not in self.hparams:
raise ValueError("vision_config not found in hparams")
# move vision config to the top level, while preserving the original hparams in global_config
self.global_config = self.hparams
if "vision_config" in self.hparams:
self.hparams = self.hparams["vision_config"]
elif "audio_config" in self.hparams:
self.hparams = self.hparams["audio_config"]
else:
raise ValueError("vision_config / audio_config not found in hparams")
self.hparams = self.hparams["vision_config"]
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
self.gguf_writer.add_vision_has_vision_encoder(True)
if self.has_vision_encoder:
self.gguf_writer.add_clip_has_vision_encoder(True)
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
# vision config
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.block_count)
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
# vision config
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.block_count)
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
elif self.has_audio_encoder:
self.gguf_writer.add_clip_has_audio_encoder(True)
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
# audio config
self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_audio_block_count(self.block_count)
self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
else:
raise ValueError("MmprojModel must have either vision or audio encoder")
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
def write_vocab(self):
raise ValueError("MmprojModel does not support vocab writing")
raise ValueError("VisionModel does not support vocab writing")
@ModelBase.register("GPTNeoXForCausalLM")
@@ -1979,7 +1951,7 @@ class LlamaModel(TextModel):
"LlavaForConditionalGeneration", # pixtral
"Mistral3ForConditionalGeneration", # mistral small 3.1
)
class LlavaVisionModel(MmprojModel):
class LlavaVisionModel(VisionModel):
img_break_tok_id = -1
def __init__(self, *args, **kwargs):
@@ -2005,7 +1977,7 @@ class LlavaVisionModel(MmprojModel):
super().set_gguf_parameters()
hparams = self.hparams
if hparams["model_type"] == "pixtral":
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
# hidden_act
@@ -2044,7 +2016,7 @@ class LlavaVisionModel(MmprojModel):
@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
class SmolVLMModel(MmprojModel):
class SmolVLMModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams["model_type"] == "smolvlm_vision":
@@ -2056,7 +2028,7 @@ class SmolVLMModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
self.gguf_writer.add_vision_use_gelu(True)
@@ -2122,10 +2094,10 @@ class Llama4Model(LlamaModel):
@ModelBase.register("Llama4ForConditionalGeneration")
class Llama4VisionModel(MmprojModel):
class Llama4VisionModel(VisionModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
assert self.hparams["hidden_act"] == "gelu"
@@ -2643,7 +2615,7 @@ class QwenModel(TextModel):
self.gguf_writer.add_file_type(self.ftype)
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2
@@ -2667,14 +2639,13 @@ class Qwen2Model(TextModel):
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
if "language_model." in name:
name = name.replace("language_model.", "") # for InternVL
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
or name.startswith("vision_model") or name.startswith("audio_tower"):
# skip vision and audio tensors
if name.startswith("mlp") or name.startswith("vision_model"):
# skip visual tensors
return []
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLModel(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2VL
@@ -2698,8 +2669,8 @@ class Qwen2VLModel(TextModel):
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLVisionModel(MmprojModel):
@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLVisionModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["image_size"] = self.hparams.get("image_size", 560)
@@ -2714,9 +2685,9 @@ class Qwen2VLVisionModel(MmprojModel):
super().set_gguf_parameters()
hparams = self.hparams
if self.global_config['model_type'] == 'qwen2_vl':
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
elif self.global_config['model_type'] == 'qwen2_5_vl':
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
self.gguf_writer.add_vision_use_silu(True)
# find n_wa_pattern (window attention pattern)
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2775,11 +2746,11 @@ class Qwen2VLVisionModel(MmprojModel):
@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):
class InternVisionModel(VisionModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
# hidden_act
if hparams["hidden_act"] == "silu":
@@ -4037,11 +4008,11 @@ class Gemma3Model(TextModel):
@ModelBase.register("Gemma3ForConditionalGeneration")
class Gemma3VisionModel(MmprojModel):
class Gemma3VisionModel(VisionModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
# default values below are taken from HF tranformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
@@ -5988,65 +5959,6 @@ class ChameleonModel(TextModel):
return data_torch
@ModelBase.register("UltravoxModel")
class UltravoxModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA # dummy
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
@ModelBase.register("Qwen2AudioForConditionalGeneration")
class WhisperEncoderModel(MmprojModel):
has_vision_encoder = False # no vision encoder
has_audio_encoder = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["hidden_size"] = self.hparams["d_model"]
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
if ".conv" in name and ".weight" in name:
return gguf.GGMLQuantizationType.F16
return False
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("language_model."):
# skip language model tensors
return []
# prevent name clashes with vision tensors
if name.startswith("multi_modal_projector"):
name = "audio." + name
if "conv1.bias" in name or "conv2.bias" in name:
# transpose conv1 and conv2 bias
data_torch = data_torch.unsqueeze(-1)
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("UltravoxModel")
class UltravoxWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False # no vision encoder
has_audio_encoder = True
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
###### CONVERSION LOGIC ######
@@ -6222,15 +6134,13 @@ def split_str_to_n_bytes(split_str: str) -> int:
def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
# TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
# maybe we should fallback to text model's arch in that case, since not many models have both
text_config = hparams.get("text_config", {})
vision_config = hparams.get("vision_config", {})
arch = hparams["architectures"][0]
# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
arch = vision_config["architectures"][0]
return arch
@@ -6293,7 +6203,7 @@ def main() -> None:
with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
hparams = ModelBase.load_hparams(dir_model)
model_architecture = get_model_architecture(hparams, model_type)
logger.info(f"Model architecture: {model_architecture}")

135
docs/backend/CANN.md Executable file → Normal file
View File

@@ -56,82 +56,60 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
## Model Supports
| Model Name | FP16 | Q4_0 | Q8_0 |
| Model Name | FP16 | Q8_0 | Q4_0 |
|:----------------------------|:-----:|:----:|:----:|
| Llama-2 | √ | √ | √ |
| Llama-3 | √ | √ | √ |
| Mistral-7B | √ | √ | √ |
| Mistral MOE | √ | √ | √ |
| DBRX | - | - | - |
| Falcon | √ | | √ |
| Chinese LLaMA/Alpaca | | | |
| Vigogne(French) | | | |
| BERT | x | x | x |
| Koala | √ | √ | √ |
| Baichuan | √ | | |
| Aquila 1 & 2 | √ | | |
| Starcoder models | √ | | |
| Refact | | | |
| MPT | √ | √ | √ |
| Bloom | √ | √ | √ |
| Yi models | √ | √ | √ |
| stablelm models | | | |
| DeepSeek models | x | x | x |
| Qwen models | √ | √ | √ |
| PLaMo-13B | √ | √ | √ |
| Phi models | √ | √ | √ |
| PhiMoE | √ | √ | √ |
| GPT-2 | √ | √ | √ |
| Orion | √ | √ | √ |
| InternlLM2 | √ | √ | √ |
| CodeShell | √ | √ | √ |
| Gemma | √ | √ | √ |
| Mamba | √ | √ | √ |
| Xverse | √ | √ | √ |
| command-r models | √ | √ | √ |
| Grok-1 | - | - | - |
| SEA-LION | √ | √ | √ |
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | | x | |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | | | |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | | | |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| OLMo | √ | √ | √ |
| OLMo 2 | √ | √ | √ |
| OLMoE | √ | √ | √ |
| Granite models | √ | √ | √ |
| GPT-NeoX | √ | √ | √ |
| Pythia | √ | √ | √ |
| Snowflake-Arctic MoE | - | - | - |
| Smaug | √ | √ | √ |
| Poro 34B | √ | √ | √ |
| Bitnet b1.58 models | | x | x |
| Flan-T5 | √ | √ | √ |
| Open Elm models | x | √ | √ |
| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
| GLM-4-0414 | √ | √ | √ |
| SmolLM | | | |
| EXAONE-3.0-7.8B-Instruct | | | |
| FalconMamba Models | √ | √ | √ |
| Jais Models | - | x | x |
| Bielik-11B-v2.3 | | | |
| RWKV-6 | - | √ | √ |
| QRWKV-6 | √ | | √ |
| GigaChat-20B-A3B | x | x | x |
| Trillion-7B-preview | √ | √ | √ |
| Ling models | | | |
**Multimodal**
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
| BakLLaVA | √ | √ | √ |
| Obsidian | √ | - | - |
| ShareGPT4V | x | - | - |
| MobileVLM 1.7B/3B models | - | - | - |
| Yi-VL | - | - | - |
| Mini CPM | √ | √ | √ |
| Moondream | √ | √ | √ |
| Bunny | √ | - | - |
| GLM-EDGE | √ | √ | √ |
| Qwen2-VL | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | | | |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | | |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | | | |
| pythia-70M | x | x | x |
| Qwen-7B | | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | | | |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | | | |
@@ -280,15 +258,6 @@ cmake --build build --config release
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
## Updates
### Basic Flash Attention Support
The basic FA kernel, built with aclnn ops, has been added in aclnn_ops.cpp.
Currently, FA only supports FP16 KV tensors and no logit softcap.
Since the aclnn interface for flash attention cannot support logit softcap, only the quantized version will be updated in the future.
Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang@pku.edu.cn), Ruiyang Ma (ruiyang@stu.pku.edu.cn), and Guojie Luo (gluo@pku.edu.cn).
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
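For reference (not from the original page), the operation the FA kernel computes and what logit softcapping would change; the softcap convention below is the common one used elsewhere in ggml and is stated here as an assumption:

$$
\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(s\,QK^{\top} + \mathrm{mask}\right)V
$$

With logit softcapping enabled (cap value $c$), the pre-softmax scores $\ell$ would additionally be replaced by $c\cdot\tanh(\ell/c)$, which is the case the current kernel does not support.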
## TODO
- Support more models and data types.

View File

@@ -107,7 +107,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
The defaults are:
- `MUSA_VERSION` set to `rc4.0.1`
- `MUSA_VERSION` set to `rc3.1.1`
The resulting images are essentially the same as the non-MUSA images:

View File

@@ -325,65 +325,36 @@ To get the official template from original HuggingFace repos, you can use [scrip
> [!TIP]
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
> [!CAUTION]
> Beware of extreme KV quantizations (e.g. `-ctk q4_0`), they can substantially degrade the model's tool calling performance.
Test in CLI (or with any library / software that can use OpenAI-compatible API backends):
```bash
curl http://localhost:8080/v1/chat/completions -d '{
"model": "gpt-3.5-turbo",
"tools": [
{
"type":"function",
"function":{
"name":"python",
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters":{
"type":"object",
"properties":{
"code":{
"type":"string",
"description":"The code to run in the ipython interpreter."
}
},
"required":["code"]
"model": "gpt-3.5-turbo",
"tools": [
{
"type":"function",
"function":{
"name":"python",
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters":{
"type":"object",
"properties":{
"code":{
"type":"string",
"description":"The code to run in the ipython interpreter."
}
},
"required":["code"]
}
}
],
"messages": [
{
"role": "user",
"content": "Print a hello world message with python."
}
]
}'
curl http://localhost:8080/v1/chat/completions -d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
{"role": "user", "content": "What is the weather in Istanbul?"}
],
"tools": [{
"type":"function",
"function":{
"name":"get_current_weather",
"description":"Get the current weather in a given location",
"parameters":{
"type":"object",
"properties":{
"location":{
"type":"string",
"description":"The city and country/state, e.g. `San Francisco, CA`, or `Paris, France`"
}
},
"required":["location"]
}
}
}]
}
}
],
"messages": [
{
"role": "user",
"content": "Print a hello world message with python."
}
]
}'
```

View File

@@ -4,9 +4,7 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
- [llama-mtmd-cli](../tools/mtmd/README.md)
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
To enable it, you can use one of the 2 methods below:
To enable it, can use use one of the 2 methods below:
- Use `-hf` option with a supported model (see a list of pre-quantized model below)
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@@ -33,14 +31,12 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
## Pre-quantized models
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org
Replace `(tool_name)` with the name of the binary you want to use, for example `llama-mtmd-cli` or `llama-server`.
NOTE: some models may require a large context window, for example `-c 8192`
**Vision models**:
```sh
# Gemma 3
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -81,20 +77,4 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Llama 4 Scout
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
# Moondream2 20250414 version
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
```
**Audio models**:
```sh
# Ultravox 0.5
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF for this model, as the results are very poor
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
```

View File

@@ -50,6 +50,8 @@ int main(int argc, char ** argv) {
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@@ -150,6 +152,9 @@ int main(int argc, char ** argv) {
// here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
const auto t_dec_start = ggml_time_us();
// sample first token
@@ -167,6 +172,12 @@ int main(int argc, char ** argv) {
}
while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
//
// Example for W = 5, N = 4, G = 2:
@@ -462,6 +473,8 @@ int main(int argc, char ** argv) {
common_sampler_free(smpl);
llama_kv_cache_view_free(&kvc_view);
llama_batch_free(batch);
llama_backend_free();

View File

@@ -24,6 +24,8 @@ int main(int argc, char ** argv){
// max. number of additional tokens to draft if match is found
const int n_draft = params.speculative.n_max;
const bool dump_kv_cache = params.dump_kv_cache;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@@ -108,9 +110,18 @@ int main(int argc, char ** argv){
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
const auto t_dec_start = ggml_time_us();
while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
// print current draft sequence
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

View File

@@ -178,6 +178,8 @@ int main(int argc, char ** argv) {
// insert new requests as soon as the previous one is done
const bool cont_batching = params.cont_batching;
const bool dump_kv_cache = params.dump_kv_cache;
// is the system prompt shared in the cache
const bool is_sp_shared = params.is_pp_shared;
@@ -239,6 +241,8 @@ int main(int argc, char ** argv) {
int32_t n_total_gen = 0;
int32_t n_cache_miss = 0;
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
const auto t_main_start = ggml_time_us();
LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -268,6 +272,11 @@ int main(int argc, char ** argv) {
LOG_INF("Processing requests ...\n\n");
while (true) {
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
common_batch_clear(batch);
// decode any currently ongoing sequences

View File

@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}
static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
if (llama_decode(ctx, batch) < 0) {
LOG_ERR("%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_encode(ctx, batch, out, s, n_embd);
batch_decode(ctx, batch, out, s, n_embd);
common_batch_clear(batch);
p += s;
s = 0;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
// final batch
float * out = emb + p * n_embd;
batch_encode(ctx, batch, out, s, n_embd);
batch_decode(ctx, batch, out, s, n_embd);
// save embeddings to chunks
for (int i = 0; i < n_chunks; i++) {
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0);
batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
common_batch_clear(query_batch);

View File

@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;
const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
const bool is_first = llama_kv_self_used_cells(ctx) == 0;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
int n_ctx_used = llama_kv_self_used_cells(ctx);
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");

View File

@@ -536,7 +536,6 @@ extern "C" {
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_GELU_ERF,
GGML_UNARY_OP_COUNT,
};
@@ -1025,16 +1024,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// GELU using erf (error function) when possible
// some backends may fall back to an approximation based on the Abramowitz and Stegun formula
GGML_API struct ggml_tensor * ggml_gelu_erf(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_gelu_quick(
struct ggml_context * ctx,
struct ggml_tensor * a);
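As a reference-only sketch (not from this patch): the exact GELU that the erf variant computes, next to an erf approximation in the spirit of Abramowitz & Stegun 7.1.26 that a backend lacking a native `erf` could fall back to. The coefficients below are the textbook ones; actual backend choices may differ:

```cpp
#include <cmath>
#include <cstdio>

// Abramowitz & Stegun 7.1.26 approximation of erf, max abs error ~1.5e-7.
static float erf_as(float x) {
    const float a1 =  0.254829592f, a2 = -0.284496736f, a3 = 1.421413741f;
    const float a4 = -1.453152027f, a5 =  1.061405429f, p  = 0.3275911f;
    const float sign = x < 0.0f ? -1.0f : 1.0f;
    x = fabsf(x);
    const float t = 1.0f / (1.0f + p * x);
    const float y = 1.0f - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * expf(-x * x);
    return sign * y;
}

// GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
static float gelu_erf(float x) {
    return 0.5f * x * (1.0f + erf_as(x * 0.70710678f));
}

int main() {
    const float xs[] = {-2.0f, -0.5f, 0.0f, 1.0f, 3.0f};
    for (float x : xs) {
        printf("x=% .2f  approx=% .6f  libm=% .6f\n",
               x, gelu_erf(x), 0.5f * x * (1.0f + erff(x * 0.70710678f)));
    }
    return 0;
}
```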

0
ggml/src/ggml-cann/CMakeLists.txt Executable file → Normal file
View File

0
ggml/src/ggml-cann/Doxyfile Executable file → Normal file
View File

2
ggml/src/ggml-cann/acl_tensor.cpp Executable file → Normal file
View File

@@ -31,8 +31,6 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
return ACL_FLOAT;
case GGML_TYPE_F16:
return ACL_FLOAT16;
case GGML_TYPE_BF16:
return ACL_BF16;
case GGML_TYPE_I8:
return ACL_INT8;
case GGML_TYPE_I16:

0
ggml/src/ggml-cann/acl_tensor.h Executable file → Normal file
View File

469
ggml/src/ggml-cann/aclnn_ops.cpp Executable file → Normal file
View File

@@ -66,7 +66,6 @@
#include <aclnnop/aclnn_gt_scalar.h>
#include <aclnnop/aclnn_pow.h>
#include <aclnnop/aclnn_grouped_matmul_v2.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <float.h>
#include <cmath>
@@ -75,13 +74,11 @@
#include <vector>
#include "ggml-impl.h"
#include "ggml.h"
#define GGML_COMMON_DECL_C
#include "../ggml-common.h"
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
aclTensor ** acl_src1, aclTensor ** acl_dst) {
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
@@ -2700,10 +2697,14 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
}
}
size_t GROUP_SIZE = 128;
// GroupedMatmulV2 required tensor_list.size < 128
size_t GROUP_SIZE = 128;
std::vector<std::vector<aclTensor*>> src0_tensor_vec_vec;
std::vector<std::vector<aclTensor*>> src1_tensor_vec_vec;
std::vector<std::vector<aclTensor*>> dst_tensor_vec_vec;
// split and call GroupedMatmulV2
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
// split and call GroupedMatmulV2
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2721,133 +2722,6 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
return;
}
/**
* @brief Performs expert-specific matrix multiplication (MoE) with
* quantized precision using the CANN backend.
*
* This function executes a matrix multiplication operation tailored for
* Mixture of Experts (MoE) models, where the input tensor is multiplied
* with expert-specific quantized weight matrices. It leverages the CANN
* backend to perform efficient low-precision computations and stores the
* quantized result in the destination tensor `dst`.
*
* Quantization techniques reduce memory footprint and improve performance
* by using lower-bit representations (e.g., int8) instead of floating-point.
* This function is designed to work with such formats and may incorporate
* optimizations like identity-based fast paths or routing masks for sparse
* expert selection.
*
* @param ctx The context for executing CANN backend operations.
* @param dst The destination tensor where the quantized MoE multiplication result
* will be stored.
*
* @note This function assumes quantized data types and is designed for
* MoE architectures with potential sparse expert routing.
*/
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// TODO: Use aclnnGroupedMatMul
//dst [M, K, N, 1]
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
ggml_tensor * ids = dst->src[2]; //ids [K, N]
GGML_TENSOR_BINARY_OP_LOCALS
// copy index from npu to cpu
int64_t n_as = ne02; // A
int64_t n_ids = ids->ne[0]; // K
std::vector<char> ids_host(ggml_nbytes(ids));
ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
ACL_MEMCPY_DEVICE_TO_HOST);
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
const enum ggml_type type = dst->src[0]->type;
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
} else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
} else {
GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
}
// src0_row [D, M, 1, 1] weight without permute
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[0] = weight_elem_size;
src0_row.nb[1] = weight_elem_size * ne00;
src0_row.nb[2] = weight_elem_size * ne00;
src0_row.nb[3] = weight_elem_size * ne00;
size_t weight_stride = ne00 * ne01 * weight_elem_size;
size_t weight_size = weight_stride * ne02 * ne03;
// scale [D, M, 1, 1] -> scale && permute
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
// src1_row [D, 1, 1, 1] -> input
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
// dst_row [M, 1, 1, 1] -> out
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
//create weight for one row
ggml_cann_pool_alloc weight_allocator(ctx.pool());
void* weight_buffer = weight_allocator.alloc(nb02);
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
// expert index
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
// If B = 1 (broadcast), always use 0; otherwise, use id.
int64_t i11 = (ne11 == 1 ? 0 : id);
int64_t i12 = iid1;
int64_t i1 = id;
int64_t i2 = i12;
void* src0_tmp_ptr = src0_original + i02*weight_stride;
void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
// mem cpy
ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE);
void* scale_buffer = (char*)weight_buffer + weight_stride;
ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE);
src0_row.data = weight_buffer;
src1_row.data = src1_tmp_ptr;
dst_row.data = dst_tmp_ptr;
dst_row.src[0] = &src0_row;
dst_row.src[1] = &src1_row;
ggml_cann_mul_mat(ctx, &dst_row);
}
}
return;
}
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
const enum ggml_type type = dst->src[0]->type;
switch (type) {
@@ -2855,339 +2729,8 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
case GGML_TYPE_F16:
ggml_cann_mul_mat_id_fp(ctx, dst);
break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
ggml_cann_mul_mat_id_quant(ctx, dst);
break;
default:
GGML_ABORT("Unsupported type for mul_mat_id");
break;
}
}
void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
ggml_tensor* src0 = dst->src[0]; // q, fp32
ggml_tensor* src1 = dst->src[1]; // k, fp16
ggml_tensor* src2 = dst->src[2]; // v, fp16
ggml_tensor* src3 = dst->src[3]; // mask, fp16
float maxBias = 0.0f;
float scaleValue = 1.0f;
float logitSoftcap = 0.0f;
memcpy(&scaleValue, (float*)dst->op_params + 0, sizeof(float));
memcpy(&maxBias, (float*)dst->op_params + 1, sizeof(float));
memcpy(&logitSoftcap, (float*)dst->op_params + 2, sizeof(float));
if(logitSoftcap == 0.0f){
size_t faElemSize = sizeof(uint16_t);
auto faDataType = ACL_FLOAT16; //ACL_BF16;
aclTensor* acl_src0_f16_tensor = nullptr;
aclTensor* acl_src1_f16_tensor = nullptr;
aclTensor* acl_src2_f16_tensor = nullptr;
aclTensor* acl_dst_f16_tensor = nullptr;
// Step 1: cast the src0 (Query) to fp16 if needed
ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
void* src0_f16_buffer = nullptr;
if(ggml_cann_type_mapping(src0->type) != faDataType){
aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0);
src0_f16_buffer = src0_f16_allocator.alloc(
ggml_nelements(src0) * faElemSize);
int64_t* src0_f16_ne = src0->ne;
size_t src0_f16_nb[GGML_MAX_DIMS];
src0_f16_nb[0] = sizeof(uint16_t);
for(int i = 1; i < GGML_MAX_DIMS; ++i){
src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
}
acl_src0_f16_tensor = ggml_cann_create_tensor(
src0_f16_buffer, faDataType, faElemSize,
src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS
);
aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType);
ggml_cann_release_resources(ctx, acl_src0_f32_tensor);
}else{
acl_src0_f16_tensor = ggml_cann_create_tensor(src0);
}
// Step 2: create the acl tensors for src1 (Key), src2 (Value),
// and the direct output from FusedInferAttention
acl_src1_f16_tensor = ggml_cann_create_tensor(src1);
acl_src2_f16_tensor = ggml_cann_create_tensor(src2);
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
void* out_f16_buffer = out_f16_allocator.alloc(
ggml_nelements(dst) * faElemSize);
int64_t* out_f16_ne = src0->ne;
size_t out_f16_nb[GGML_MAX_DIMS];
out_f16_nb[0] = faElemSize;
for(int i = 1; i < GGML_MAX_DIMS; ++i){
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
}
acl_dst_f16_tensor = ggml_cann_create_tensor(
out_f16_buffer, faDataType, faElemSize,
out_f16_ne, out_f16_nb, GGML_MAX_DIMS
);
// Step 3: create the PSEShift tensor if needed
// this tensor is considered as mask (f16) in the llama.cpp
aclTensor* bcast_pse_tensor = nullptr;
int64_t bcast_pse_ne[GGML_MAX_DIMS];
size_t bcast_pse_nb[GGML_MAX_DIMS];
ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
void* bcast_pse_buffer = nullptr;
if(src3 != nullptr){
bcast_pse_buffer = bcast_pse_allocator.alloc(
ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
if(src0->ne[1] > 1){
// Case 1: broadcast pse for the prefill stage with multiple heads
aclTensor* acl_mask_f16_tensor = ggml_cann_create_tensor(src3);
bcast_pse_ne[0] = src3->ne[0];
bcast_pse_ne[1] = src3->ne[1];
bcast_pse_ne[2] = src0->ne[2];
bcast_pse_ne[3] = src3->ne[3];
bcast_pse_nb[0] = sizeof(uint16_t);
for(int i = 1; i < GGML_MAX_DIMS; ++i){
bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
}
bcast_pse_tensor = ggml_cann_create_tensor(
bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
int64_t repeats[] = {1, src0->ne[2], 1, 1};
aclnn_repeat(ctx, acl_mask_f16_tensor, bcast_pse_tensor, repeats);
ggml_cann_release_resources(ctx, acl_mask_f16_tensor);
}else{
// Case 2: truncate the first row and broadcast pse for the decode stage with multiple heads
int64_t trunc_pse_ne[GGML_MAX_DIMS] = {src3->ne[0], src0->ne[1], src3->ne[2], src3->ne[3]};
size_t* trunc_pse_nb = src3->nb;
aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
src3->data, ACL_FLOAT16, sizeof(uint16_t),
trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
bcast_pse_ne[0] = src3->ne[0];
bcast_pse_ne[1] = src0->ne[1];
bcast_pse_ne[2] = src0->ne[2];
bcast_pse_ne[3] = src3->ne[3];
bcast_pse_nb[0] = sizeof(uint16_t);
for(int i = 1; i < GGML_MAX_DIMS; ++i){
bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
}
bcast_pse_tensor = ggml_cann_create_tensor(
bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
int64_t repeats[] = {1, src0->ne[2], 1, 1};
aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats);
ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor);
}
// Compute the slope if needed. Derived from ggml_cann_softmax().
if(maxBias != 0.0f){
// alibi
const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
const int64_t n_head = src0->ne[2];
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
// init arange
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
ne2_ne3 * faElemSize);
void* tmp_arange_buffer = arange_allocator.get();
// arange1: [1, ..., n_heads_log2_floor+1)
float start = 1;
float stop = n_heads_log2_floor + 1;
float step = 1;
int64_t n_elements_arange = n_heads_log2_floor;
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
size_t tmp_arange1_nb[] = {faElemSize};
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
tmp_arange_buffer, faDataType, faElemSize,
tmp_arange1_ne, tmp_arange1_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
aclTensor* tmp_arange2_tensor = nullptr;
if (n_heads_log2_floor < ne2_ne3) {
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
start = 1;
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
step = 2;
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
size_t tmp_arange2_nb[] = {faElemSize};
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
(char*)tmp_arange_buffer +
n_heads_log2_floor * faElemSize,
faDataType, faElemSize,
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
n_elements_arange);
}
// init mk_base
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
ne2_ne3 * faElemSize);
void* tmp_mk_base_buffer = mk_base_allocator.get();
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
size_t tmp_mk_base1_nb[] = {faElemSize};
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, faDataType, faElemSize,
tmp_mk_base1_ne, tmp_mk_base1_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
aclTensor* tmp_mk_base2_tensor = nullptr;
if (n_heads_log2_floor < ne2_ne3) {
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
size_t tmp_mk_base2_nb[] = {faElemSize};
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
(char*)tmp_mk_base_buffer +
n_heads_log2_floor * faElemSize,
faDataType, faElemSize,
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
}
// init mk
int64_t tmp_mk_base_ne[] = {ne2_ne3};
size_t tmp_mk_base_nb[] = {faElemSize};
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, faDataType, faElemSize,
tmp_mk_base_ne, tmp_mk_base_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
tmp_arange_buffer, faDataType, faElemSize,
tmp_mk_base_ne, tmp_mk_base_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
// reshape mk
int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
size_t tmp_mk_nb[GGML_MAX_DIMS];
tmp_mk_nb[0] = faElemSize;
for (int i = 1; i < GGML_MAX_DIMS; i++) {
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
}
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, faDataType, faElemSize,
tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
ACL_FORMAT_ND);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
tmp_arange_tensor, tmp_mk_tensor);
}
}
// Step 4: set the inputs for FusedInferAttention.
int kvTensorNum = 1;
aclTensor* acl_q_tensor = acl_src0_f16_tensor;
aclTensor* acl_k_tensors[] = {acl_src1_f16_tensor};
aclTensor* acl_v_tensors[] = {acl_src2_f16_tensor};
auto acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum);
auto acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);
int64_t numHeads = src0->ne[2]; // N
int64_t numKeyValueHeads = src1->ne[2];
// double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
int64_t preTokens = 65535;
int64_t nextTokens = 65535;
char layout[5] = {'B', 'N', 'S', 'D', 0};
int64_t sparseMode = 0;
int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
int64_t blockSize = 0;
int64_t antiquantMode = 0;
bool softmaxLseFlag = false;
int64_t keyAntiquantMode = 0;
int64_t valueAntiquantMode = 0;
// Step 5: launch the FusedInferAttentionScoreV2 kernel.
// Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
bcast_pse_tensor, nullptr, // pse, mask
nullptr, nullptr, // actSeqLen, actSeqLenkv
nullptr, nullptr, // deqScale1, quantScale1
nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
nullptr, nullptr, // antiquantScale, antiquantOffset
nullptr, // blockTable
nullptr, nullptr, // qPadSize, kvPadSize
nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
numHeads, scaleValue, // heads, scaleValue
preTokens, nextTokens, // preTokens, nextTokens
layout, // inputLayout
numKeyValueHeads, // numKVHeads
sparseMode, innerPrecise, // sparseMode, innerPrecise
blockSize, antiquantMode, // blockSize, antiquantMode
softmaxLseFlag, // softmaxLseFlag
keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
acl_dst_f16_tensor, // attentionOut
nullptr // softmaxLse
);
// Step 6: post-processing, permute and cast to f32
int64_t new_dim[] = {0, 2, 1, 3};
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
if(ggml_cann_type_mapping(dst->type) != faDataType){
ggml_cann_pool_alloc perm_out_f16_allocator(ctx.pool());
perm_out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
void* perm_out_f16_buffer = perm_out_f16_allocator.get();
int64_t* perm_out_f16_ne = dst->ne;
size_t perm_out_f16_nb[GGML_MAX_DIMS];
perm_out_f16_nb[0] = faElemSize;
for(int i = 1; i < GGML_MAX_DIMS; ++i){
perm_out_f16_nb[i] = perm_out_f16_nb[i - 1] * perm_out_f16_ne[i - 1];
}
aclTensor* acl_perm_out_f16_tensor = ggml_cann_create_tensor(
perm_out_f16_buffer, faDataType, faElemSize,
perm_out_f16_ne, perm_out_f16_nb, GGML_MAX_DIMS);
aclnn_permute(ctx, acl_dst_f16_tensor, acl_perm_out_f16_tensor, new_dim, GGML_MAX_DIMS);
aclnn_cast(ctx,
acl_perm_out_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
ggml_cann_release_resources(ctx, acl_perm_out_f16_tensor);
}else{
// only need to permute
aclnn_permute(ctx, acl_dst_f16_tensor, acl_dst_tensor, new_dim, GGML_MAX_DIMS);
}
ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
acl_src1_f16_tensor,
acl_src2_f16_tensor,
acl_dst_f16_tensor,
acl_dst_tensor);
if(src3 != nullptr){
ggml_cann_release_resources(ctx, bcast_pse_tensor);
}
}else{
GGML_ABORT("Function is not implemented.");
}
}

15
ggml/src/ggml-cann/aclnn_ops.h Executable file → Normal file
View File

@@ -714,21 +714,6 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
*/
void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Performs the Flash Attention extended operator using the CANN backend.
*
* @details This function implements the memory-efficient Flash Attention algorithm
* for computing scaled dot-product attention with hardware acceleration.
* The result is stored in the destination tensor `dst`.
*
* This operation is accelerated using the CANN backend to improve runtime performance.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the result will be stored.
* dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
*/
void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/*
* @brief A generic wrapper for ACL resources with custom deleter support.
*/

ggml/src/ggml-cann/common.h Executable file → Normal file

ggml/src/ggml-cann/ggml-cann.cpp Executable file → Normal file

@@ -36,7 +36,6 @@
#include "ggml-backend-impl.h"
#include "ggml-cann/aclnn_ops.h"
#include "ggml-cann/common.h"
#include "ggml.h"
#define GGML_COMMON_DECL_C
@@ -1749,9 +1748,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
case GGML_OP_COUNT_EQUAL:
ggml_cann_count_equal(ctx, dst);
break;
case GGML_OP_FLASH_ATTN_EXT:
ggml_cann_flash_attn_ext(ctx, dst);
break;
default:
return false;
}
@@ -2039,15 +2035,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return true;
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
#ifdef ASCEND_310P
// Q4 and Q8 per-group quantization is not supported on the 310P device
return false;
#endif
// only support contiguous for quantized types.
return ggml_is_contiguous(op->src[0]) &&
ggml_is_contiguous(op->src[1]);
default:
return false;
}
@@ -2181,38 +2168,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_COUNT_EQUAL:
return true;
case GGML_OP_FLASH_ATTN_EXT:{
// derived from [ggml-cuda.cu]
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
return false;
}
if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
return false;
}
if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
return false;
}
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
// different head sizes of K and V are not supported yet
return false;
}
if (op->src[0]->ne[0] == 192) {
return false;
}
if (op->src[0]->ne[0] == 576) {
// DeepSeek MLA
return false;
}
if (op->src[0]->ne[3] != 1) {
return false;
}
float logitSoftcap = 0.0f;
memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
if(logitSoftcap != 0.0f) {
return false;
}
return true;
}
default:
return false;
}
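The FLASH_ATTN_EXT support check above rejects a non-zero logit softcap by reading the third float packed into op_params. A small standalone sketch of that unpacking (illustrative only; read_softcap is a hypothetical helper, not a ggml function):

#include <cstring>

// Hedged sketch: op_params holds packed 32-bit values; the softcap factor is
// stored at float index 2, so it is copied out with memcpy to avoid aliasing.
static float read_softcap(const void * op_params) {
    float softcap = 0.0f;
    std::memcpy(&softcap, (const float *) op_params + 2, sizeof(float));
    return softcap;
}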


@@ -2202,7 +2202,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
{
@@ -3484,19 +3483,6 @@ void ggml_cpu_init(void) {
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
#ifdef GGML_USE_OPENMP
//if (!getenv("OMP_WAIT_POLICY")) {
// // set the wait policy to active, so that OpenMP threads don't sleep
// putenv("OMP_WAIT_POLICY=active");
//}
if (!getenv("KMP_BLOCKTIME")) {
// set the time to wait before sleeping a thread
// this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
putenv("KMP_BLOCKTIME=200"); // 200ms
}
#endif
}
#if defined(__ARM_ARCH)
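The removed OpenMP tuning above only sets KMP_BLOCKTIME when the user has not already set it. A minimal sketch of that guarded-default pattern (illustrative only; set_blocktime_default is a hypothetical helper):

#include <stdlib.h>

// Hedged sketch: set a default environment variable only when it is unset.
// putenv keeps the pointer it is given, so a string literal is used here.
static void set_blocktime_default(void) {
    if (!getenv("KMP_BLOCKTIME")) {
        putenv((char *) "KMP_BLOCKTIME=200"); // 200 ms before worker threads go to sleep
    }
}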


@@ -2691,109 +2691,6 @@ static void ggml_compute_forward_gelu(
}
}
// ggml_compute_forward_gelu_erf
static void ggml_compute_forward_gelu_erf_f32(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_gelu_erf_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])));
#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
GGML_UNUSED(x);
assert(!isnan(x));
assert(!isinf(x));
}
#endif
}
}
static void ggml_compute_forward_gelu_erf_f16(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_gelu_erf_f16(nc,
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
const float v = GGML_FP16_TO_FP32(x);
GGML_UNUSED(v);
assert(!isnan(v));
assert(!isinf(v));
}
#endif
}
}
static void ggml_compute_forward_gelu_erf(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_gelu_erf_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
ggml_compute_forward_gelu_erf_f16(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
}
}
}
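Both erf variants above split work by rows in the standard ggml way: each of nth threads takes a contiguous chunk of ceil(nr/nth) rows. A standalone sketch of that partition (illustrative only; row_range is a hypothetical helper):

#include <algorithm>

// Hedged sketch: thread 'ith' of 'nth' handles rows [ir0, ir1) of an
// 'nr'-row tensor, matching the dr/ir0/ir1 computation in the ops above.
static void row_range(int nr, int nth, int ith, int & ir0, int & ir1) {
    const int dr = (nr + nth - 1) / nth;  // rows per thread, rounded up
    ir0 = dr * ith;
    ir1 = std::min(ir0 + dr, nr);         // the last thread may get fewer rows
}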
// ggml_compute_forward_gelu_quick
static void ggml_compute_forward_gelu_quick_f32(
@@ -7852,10 +7749,6 @@ void ggml_compute_forward_unary(
{
ggml_compute_forward_gelu(params, dst);
} break;
case GGML_UNARY_OP_GELU_ERF:
{
ggml_compute_forward_gelu_erf(params, dst);
} break;
case GGML_UNARY_OP_GELU_QUICK:
{
ggml_compute_forward_gelu_quick(params, dst);


@@ -428,7 +428,6 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
inline static float ggml_gelu_f32(float x) {
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -441,14 +440,6 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
}
}
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float xi = GGML_FP16_TO_FP32(x[i]);
float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
y[i] = GGML_FP32_TO_FP16(res);
}
}
#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
@@ -472,13 +463,6 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
}
#endif
inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
for (int i = 0; i < n; ++i) {
float xi = x[i];
y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
}
}
inline static float ggml_gelu_quick_f32(float x) {
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}
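For reference, the erf-based helpers removed above evaluate the exact GELU, while the remaining ggml_gelu_f32 path keeps the tanh approximation:

\mathrm{GELU}(x) = x\,\Phi(x) = \tfrac{1}{2}\,x\left(1 + \operatorname{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right) \approx \tfrac{1}{2}\,x\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)

where SQRT_2_INV = 1/\sqrt{2}, SQRT_2_OVER_PI = \sqrt{2/\pi} and GELU_COEF_A = 0.044715 are the constants defined above.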


@@ -1,8 +1,5 @@
#include "cpy.cuh"
#include "dequantize.cuh"
#ifdef GGML_USE_MUSA
#include "ggml-musa/mudnn.cuh"
#endif // GGML_USE_MUSA
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
@@ -600,14 +597,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
#endif
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
#ifdef GGML_USE_MUSA
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
} else
#endif // GGML_USE_MUSA
{
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
}
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {


@@ -772,7 +772,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
GGML_UNUSED(kb0);
NO_DEVICE_CODE;
#endif // NEW_MMA_AVAILABLE
}


@@ -2,9 +2,9 @@
#include "fattn-common.cuh"
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // GGML_USE_HIP
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
@@ -48,12 +48,6 @@ static __global__ void flash_attn_vec_ext_f16(
NO_DEVICE_CODE;
return;
}
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
if (ncols > 1) {
NO_DEVICE_CODE;
return;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
@@ -97,13 +91,6 @@ static __global__ void flash_attn_vec_ext_f16(
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__shared__ half maskh_shared[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskh_shared[j*D + tid] = 0.0f;
}
__syncthreads();
// Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -188,36 +175,6 @@ static __global__ void flash_attn_vec_ext_f16(
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
if (mask) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
}
__syncthreads();
// When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
// In such cases, skip the KV slice.
// On AMD __all_sync would not work correctly because it assumes a warp size of 64.
#ifndef GGML_USE_HIP
bool skip = true;
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
skip = skip && isinf(tmp.x) && isinf(tmp.y);
}
}
if (__all_sync(0xFFFFFFFF, skip)) {
__syncthreads();
continue;
}
#endif // GGML_USE_HIP
}
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
@@ -245,7 +202,7 @@ static __global__ void flash_attn_vec_ext_f16(
sum = logit_softcap*tanhf(sum);
}
sum += maskh_shared[j*D + i_KQ];
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
@@ -378,9 +335,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
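The shared-memory mask staging removed in this file also enabled an early-out: when every mask value in a KV tile is -inf, the tile contributes nothing to the softmax and can be skipped. A scalar C++ sketch of that check, separate from the CUDA warp-vote version above (which uses __all_sync):

#include <cmath>
#include <cstddef>

// Hedged sketch: return true when an entire KV tile is masked out (-inf),
// so the attention loop may skip it without changing the result.
static bool tile_fully_masked(const float * mask_tile, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        if (!std::isinf(mask_tile[i])) {
            return false; // at least one position is still attendable
        }
    }
    return true;
}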


@@ -2,9 +2,9 @@
#include "fattn-common.cuh"
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // GGML_USE_HIP
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,
@@ -60,12 +60,6 @@ static __global__ void flash_attn_vec_ext_f32(
NO_DEVICE_CODE;
return;
}
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
if (ncols > 1) {
NO_DEVICE_CODE;
return;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
@@ -110,13 +104,6 @@ static __global__ void flash_attn_vec_ext_f32(
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__shared__ float maskf_shared[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskf_shared[j*D + tid] = 0.0f;
}
__syncthreads();
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -194,35 +181,6 @@ static __global__ void flash_attn_vec_ext_f32(
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
if (mask) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
}
__syncthreads();
// When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
// In such cases, skip the KV slice.
// On AMD __all_sync would not work correctly because it assumes a warp size of 64.
#ifndef GGML_USE_HIP
bool skip = true;
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
skip = skip && isinf(maskf_shared[j*D + i]);
}
}
if (__all_sync(0xFFFFFFFF, skip)) {
__syncthreads();
continue;
}
#endif // GGML_USE_HIP
}
float kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
@@ -246,7 +204,7 @@ static __global__ void flash_attn_vec_ext_f32(
sum = logit_softcap*tanhf(sum);
}
sum += maskf_shared[j*D + i_KQ];
sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
@@ -368,9 +326,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;


@@ -2192,9 +2192,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_SILU:
ggml_cuda_op_silu(ctx, dst);
break;
case GGML_UNARY_OP_GELU_ERF:
ggml_cuda_op_gelu_erf(ctx, dst);
break;
case GGML_UNARY_OP_GELU_QUICK:
ggml_cuda_op_gelu_quick(ctx, dst);
break;
@@ -2980,7 +2977,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:


@@ -23,12 +23,6 @@ static __device__ __forceinline__ float op_gelu(float x) {
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
static __device__ __forceinline__ float op_gelu_erf(float x) {
const float SQRT_2_INV = 0.70710678118654752440084436210484f;
return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
}
static __device__ __forceinline__ float op_gelu_quick(float x) {
const float GELU_QUICK_COEF = -1.702f;
@@ -140,10 +134,6 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_gelu>(ctx, dst);
}
void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_gelu_erf>(ctx, dst);
}
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
}


@@ -30,8 +30,6 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -149,8 +149,6 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_SIGMOID,
GGML_METAL_KERNEL_TYPE_GELU,
GGML_METAL_KERNEL_TYPE_GELU_4,
GGML_METAL_KERNEL_TYPE_GELU_ERF,
GGML_METAL_KERNEL_TYPE_GELU_ERF_4,
GGML_METAL_KERNEL_TYPE_GELU_QUICK,
GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
GGML_METAL_KERNEL_TYPE_SILU,
@@ -1105,8 +1103,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF, gelu_erf, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF_4, gelu_erf_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
@@ -1617,7 +1613,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
@@ -2256,25 +2251,6 @@ static bool ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_GELU_ERF:
{
int64_t n = ggml_nelements(dst);
id<MTLComputePipelineState> pipeline = nil;
if (n % 4 == 0) {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF_4].pipeline;
n /= 4;
} else {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF].pipeline;
}
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_GELU_QUICK:
{
int64_t n = ggml_nelements(dst);


@@ -856,7 +856,6 @@ kernel void kernel_tanh(
constant float GELU_COEF_A = 0.044715f;
constant float GELU_QUICK_COEF = -1.702f;
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
constant float SQRT_2_INV = 0.70710678118654752440084436210484f;
kernel void kernel_gelu(
device const float * src0,
@@ -898,42 +897,6 @@ kernel void kernel_gelu_quick_4(
dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
}
// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
// ref: https://www.johndcook.com/blog/python_erf/
constant float p_erf = 0.3275911f;
constant float a1_erf = 0.254829592f;
constant float a2_erf = -0.284496736f;
constant float a3_erf = 1.421413741f;
constant float a4_erf = -1.453152027f;
constant float a5_erf = 1.061405429f;
template<typename T>
T erf_approx(T x) {
T sign_x = sign(x);
x = fabs(x);
T t = 1.0f / (1.0f + p_erf * x);
T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
return sign_x * y;
}
kernel void kernel_gelu_erf(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
dst[tpig] = 0.5f*x*(1.0f+erf_approx<float>(x*SQRT_2_INV));
}
kernel void kernel_gelu_erf_4(
device const float4 * src0,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
device const float4 & x = src0[tpig];
dst[tpig] = 0.5f*x*(1.0f+erf_approx<float4>(x*SQRT_2_INV));
}
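The erf_approx template removed above is the classic Abramowitz–Stegun 7.1.26 rational approximation, evaluated in Horner form. Written out for x >= 0 (with erf(-x) = -erf(x)):

\operatorname{erf}(x) \approx 1 - \left(a_1 t + a_2 t^{2} + a_3 t^{3} + a_4 t^{4} + a_5 t^{5}\right) e^{-x^{2}}, \qquad t = \frac{1}{1 + p\,x}

where p = 0.3275911 and a_1 … a_5 are the p_erf/a*_erf constants above; the maximum absolute error of this form is on the order of 1.5e-7.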
kernel void kernel_silu(
device const float * src0,
device float * dst,
@@ -3292,7 +3255,7 @@ template<
typename kd4x4_t, // key type in device memory
short nl_k,
void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
typename vd4x4_t, // value type in device memory
typename vd4x4_t, // key type in device memory
short nl_v,
void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
short DK, // K head size
@@ -3813,7 +3776,7 @@ template<
typename kd4_t, // key type in device memory
short nl_k,
void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
typename vd4_t, // value type in device memory
typename vd4_t, // key type in device memory
short nl_v,
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
short DK, // K head size


@@ -27,15 +27,12 @@ if (MUSAToolkit_FOUND)
file(GLOB GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")
file(GLOB GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
list(APPEND GGML_SOURCES_MUSA ${SRCS})
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_MUSA ${SRCS})
file(GLOB SRCS "../ggml-musa/*.cu")
list(APPEND GGML_SOURCES_MUSA ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -65,9 +62,7 @@ if (MUSAToolkit_FOUND)
)
# TODO: do not use CUDA definitions for MUSA
if (NOT GGML_BACKEND_DL)
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
endif()
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
add_compile_definitions(GGML_USE_MUSA)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
@@ -97,10 +92,9 @@ if (MUSAToolkit_FOUND)
endif()
if (GGML_STATIC)
# TODO: mudnn has not provided static libraries yet
target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
else()
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
endif()
if (GGML_CUDA_NO_VMM)


@@ -1,112 +0,0 @@
#include <mutex>
#include <mudnn.h>
#include "mudnn.cuh"
namespace mudnn = musa::dnn;
// Returns a human-readable error string for mudnn::Status
const char* mudnnGetErrorString(mudnn::Status err) {
switch (err) {
case mudnn::Status::SUCCESS:
return "Success";
case mudnn::Status::INVALID_PARAMETER:
return "Invalid parameter";
case mudnn::Status::NOT_INITIALIZED:
return "Not initialized";
case mudnn::Status::ALLOC_FAILED:
return "Allocation failed";
case mudnn::Status::NOT_SUPPORTED:
return "Not supported";
case mudnn::Status::INTERNAL_ERROR:
return "Internal error";
case mudnn::Status::ARCH_MISMATCH:
return "Architecture mismatch";
case mudnn::Status::EXECUTION_FAILED:
return "Execution failed";
default:
return "Unknown mudnn status";
}
}
// Error checking macro for MUDNN calls
#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)
namespace {
// Thread-safe cache for mudnn::Handle objects per device
std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache;
std::mutex handle_cache_mutex;
mudnn::Handle* get_cached_handle(int device_id) {
std::lock_guard<std::mutex> lock(handle_cache_mutex);
auto it = handle_cache.find(device_id);
if (it != handle_cache.end()) {
return it->second.get();
}
auto handle = std::make_unique<mudnn::Handle>(device_id);
mudnn::Handle* handle_ptr = handle.get();
handle_cache[device_id] = std::move(handle);
return handle_ptr;
}
}
// Extracts dimensions and strides from a ggml_tensor
int get_ggml_dims_and_strides(const ggml_tensor* tensor,
std::vector<int64_t>& dims,
std::vector<int64_t>& strides) {
const int ndims = ggml_n_dims(tensor);
const size_t element_size = ggml_element_size(tensor);
dims.resize(ndims);
strides.resize(ndims);
for (int i = 0; i < ndims; ++i) {
dims[i] = tensor->ne[i];
strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size);
}
return ndims;
}
// Converts ggml_type to mudnn::Tensor::Type
mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return mudnn::Tensor::Type::FLOAT;
case GGML_TYPE_F16:
return mudnn::Tensor::Type::HALF;
// TODO: Add support for other types
default:
MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED);
}
return mudnn::Tensor::Type::FLOAT; // Default fallback
}
// Asynchronous memory copy using mudnn::Unary::IDENTITY
musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
mudnn::Tensor tensor_dst, tensor_src;
MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));
std::vector<int64_t> dims, strides;
const int ndims = get_ggml_dims_and_strides(src, dims, strides);
MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data()));
MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
MUDNN_CHECK(tensor_src.SetAddr(src->data));
mudnn::Unary op;
MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
MUDNN_CHECK(op.SetAlpha(0.0f));
MUDNN_CHECK(op.SetBeta(0.0f));
mudnn::Handle* handle = get_cached_handle(ctx.device);
MUDNN_CHECK(handle->SetStream(ctx.stream()));
MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));
return musaSuccess;
}


@@ -1,12 +0,0 @@
#pragma once
#include "../include/ggml.h"
#include "../ggml-cuda/common.cuh"
// Asynchronously copies data from src tensor to dst tensor using the provided context.
// Returns a musaError_t indicating success or failure.
musaError_t mudnnMemcpyAsync(
ggml_backend_cuda_context &ctx,
const ggml_tensor *dst,
const ggml_tensor *src
);


@@ -27,7 +27,6 @@
#include <cmath>
#include <memory>
#include <charconv>
#include <mutex>
#undef MIN
#undef MAX
@@ -75,7 +74,6 @@ struct ggml_cl_version {
cl_uint minor = 0;
};
struct ggml_cl_compiler_version {
ADRENO_CL_COMPILER_TYPE type;
int major = -1;
@@ -93,14 +91,6 @@ struct ggml_cl_compiler_version {
}
};
static size_t align_to(size_t value, size_t to_alignment) {
GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
return ((value + to_alignment - 1) / to_alignment) * to_alignment;
}
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
static ggml_cl_version parse_cl_version(std::string_view str) {
size_t major_str_begin = 0;
@@ -231,25 +221,13 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
return { type, major, minor, patch };
}
struct ggml_backend_opencl_context;
// backend device context
struct ggml_backend_opencl_device_context {
cl_platform_id platform;
std::string platform_name;
cl_device_id device;
std::string device_name;
cl_device_type device_type;
std::string device_version;
// Initialized by ggml_cl2_init().
ggml_backend_opencl_context * backend_ctx = nullptr;
// Initialized by ggml_backend_opencl_device_get_buffer_type()
ggml_backend_buffer_type buffer_type;
cl_context context = nullptr;
cl_device_id device;
std::string device_name;
};
// backend context
@@ -270,8 +248,6 @@ struct ggml_backend_opencl_context {
int adreno_wave_size;
cl_bool non_uniform_workgroups;
cl_context context;
cl_command_queue queue;
@@ -368,8 +344,15 @@ struct ggml_backend_opencl_context {
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
};
// All registered devices with a default device in the front.
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
static ggml_backend_device g_ggml_backend_opencl_device;
static ggml_backend_opencl_device_context g_ggml_ctx_dev_main {
/*.platform =*/ nullptr,
/*.platform_name =*/ "",
/*.device =*/ nullptr,
/*.device_name =*/ "",
};
static int ggml_backend_opencl_n_devices = 0;
// Profiling
#ifdef GGML_OPENCL_PROFILING
@@ -1124,19 +1107,25 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT("\n");
}
// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// XXX static bool initialized = false;
// XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
static bool initialized = false;
static ggml_backend_opencl_context *backend_ctx = nullptr;
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
if (initialized) {
return backend_ctx;
}
namespace /* anonymous */ {
extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
}
ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
GGML_ASSERT(dev_ctx);
GGML_ASSERT(dev_ctx->platform == nullptr);
GGML_ASSERT(dev_ctx->device == nullptr);
GGML_ASSERT(backend_ctx == nullptr);
// Look for available and suitable devices.
static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
std::vector<ggml_backend_device> found_devices;
initialized = true;
backend_ctx = new ggml_backend_opencl_context();
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
cl_int err;
#ifdef GGML_OPENCL_PROFILING
GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
@@ -1169,12 +1158,11 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
struct cl_device devices[NDEV];
unsigned n_devices = 0;
struct cl_device * default_device = NULL;
unsigned default_platform_number = 0;
cl_platform_id platform_ids[NPLAT];
if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
return found_devices;
return backend_ctx;
}
for (unsigned i = 0; i < n_platforms; i++) {
@@ -1209,22 +1197,19 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
}
if (default_device == NULL && p->default_device != NULL) {
default_device = p->default_device;
default_platform_number = i;
default_device = p->default_device;
}
}
if (n_devices == 0) {
GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
return found_devices;
return backend_ctx;
}
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
int user_platform_number = -1;
int user_device_number = -1;
cl_device * candidate_devices = nullptr;
unsigned n_candidate_devices = 0;
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
int user_platform_number = -1;
int user_device_number = -1;
unsigned n;
if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
@@ -1239,11 +1224,12 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
exit(1);
}
default_device = &platform->devices[user_device_number];
candidate_devices = platform->devices;
n_candidate_devices = platform->n_devices;
default_device = &platform->devices[user_device_number];
} else {
// Choose a platform by matching a substring.
struct cl_device * selected_devices = devices;
unsigned n_selected_devices = n_devices;
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
for (unsigned i = 0; i < n_platforms; i++) {
struct cl_platform * p = &platforms[i];
@@ -1258,20 +1244,20 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
exit(1);
}
}
int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
struct cl_platform * p = &platforms[platform_idx];
candidate_devices = p->devices;
n_candidate_devices = p->n_devices;
default_device = p->default_device;
if (n_candidate_devices == 0) {
GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
exit(1);
if (user_platform_number != -1) {
struct cl_platform * p = &platforms[user_platform_number];
selected_devices = p->devices;
n_selected_devices = p->n_devices;
default_device = p->default_device;
if (n_selected_devices == 0) {
GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
exit(1);
}
}
if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
for (unsigned i = 0; i < n_candidate_devices; i++) {
struct cl_device * d = &candidate_devices[i];
for (unsigned i = 0; i < n_selected_devices; i++) {
struct cl_device * d = &selected_devices[i];
if (strstr(d->name, user_device_string) != NULL) {
user_device_number = d->number;
break;
@@ -1283,145 +1269,71 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
}
}
if (user_device_number != -1) {
candidate_devices = &devices[user_device_number];
n_candidate_devices = 1;
default_device = &candidate_devices[0];
selected_devices = &devices[user_device_number];
n_selected_devices = 1;
default_device = &selected_devices[0];
}
GGML_ASSERT(n_candidate_devices > 0);
GGML_ASSERT(n_selected_devices > 0);
if (default_device == NULL) {
default_device = &candidate_devices[0];
default_device = &selected_devices[0];
}
}
GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
// Put the default device in front.
for (unsigned i = 1; i < n_candidate_devices; i++) {
if (&candidate_devices[i] == default_device) {
std::swap(candidate_devices[0], candidate_devices[i]);
default_device = &candidate_devices[0];
break;
}
GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
if (default_device->type != CL_DEVICE_TYPE_GPU) {
GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
}
GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);
dev_ctx->platform = default_device->platform->id;
dev_ctx->device = default_device->id;
backend_ctx->device = default_device->id;
std::vector<cl_device_id> device_ids;
for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
device_ids.push_back(dev->id);
}
cl_int err;
cl_context shared_context;
cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
CL_CHECK(
(shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
/*.platform =*/dev->platform->id,
/*.platform_name =*/dev->platform->name,
/*.device =*/dev->id,
/*.device_name =*/dev->name,
/*.device_type =*/dev->type,
/*.device_version =*/dev->version,
/*.backend_ctx =*/nullptr,
/*.buffer_type =*/{},
/*.context =*/shared_context,
});
found_devices.push_back(ggml_backend_device{
/* .iface = */ ggml_backend_opencl_device_i,
/* .reg = */ reg,
/* .context = */ dev_ctx.get(),
});
if (!ggml_cl2_init(&found_devices.back())) {
found_devices.pop_back();
GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
continue;
}
dev_ctx.release();
}
if (found_devices.size()) {
auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
dev_ctx->device_version.c_str());
if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
dev_ctx->device_name.c_str());
}
}
return found_devices;
}
// Initialize device if it is supported (returns nullptr if it is not).
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
GGML_ASSERT(dev);
GGML_ASSERT(dev->context);
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
GGML_ASSERT(dev_ctx->platform);
GGML_ASSERT(dev_ctx->device);
if (dev_ctx->backend_ctx) {
return dev_ctx->backend_ctx;
}
auto backend_ctx = std::make_unique<ggml_backend_opencl_context>();
backend_ctx->device = dev_ctx->device;
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
if (strstr(default_device->name, "Adreno") ||
strstr(default_device->name, "Qualcomm") ||
strstr(default_device->version, "Adreno")) {
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
// Usually device version contains the detailed device name
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
}
// Use wave size of 64 for all Adreno GPUs.
backend_ctx->adreno_wave_size = 64;
} else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
} else if (strstr(default_device->name, "Intel")) {
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
} else {
GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
return nullptr;
return backend_ctx;
}
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
return nullptr;
return backend_ctx;
}
#endif
// Populate backend device name
backend_ctx->device_name = dev_ctx->device_name;
dev_ctx->platform_name = default_device->platform->name;
dev_ctx->device_name = default_device->name;
backend_ctx->device_name = default_device->name;
// A local ref of cl_device_id for convenience
cl_device_id device = backend_ctx->device;
ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
// Check device OpenCL version, OpenCL 2.0 or above is required
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
if (opencl_c_version.major < 2) {
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
return nullptr;
return backend_ctx;
}
// Check driver version
@@ -1452,7 +1364,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// fp16 is required
if (!backend_ctx->fp16_support) {
GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
return nullptr;
return backend_ctx;
}
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
@@ -1461,7 +1373,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
return nullptr;
return backend_ctx;
}
cl_uint base_align_in_bits;
@@ -1485,15 +1397,6 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
if (opencl_c_version.major >= 3) {
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
&backend_ctx->non_uniform_workgroups, 0));
} else {
GGML_ASSERT(opencl_c_version.major == 2);
// Non-uniform work-group sizes are a mandatory feature in v2.x.
backend_ctx->non_uniform_workgroups = true;
}
// Print out configurations
#ifdef GGML_OPENCL_SOA_Q
GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
@@ -1503,10 +1406,14 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
cl_int err;
cl_context_properties properties[] = {
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
};
CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
// A local ref of cl_context for convenience
cl_context context = backend_ctx->context = dev_ctx->context;
cl_context context = backend_ctx->context;
//CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
// (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
@@ -1519,7 +1426,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
// Load kernels
load_cl_kernels(backend_ctx.get(), opencl_c_version);
load_cl_kernels(backend_ctx, opencl_c_version);
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// Allocate intermediate buffers and images
@@ -1549,8 +1456,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
dev_ctx->backend_ctx = backend_ctx.release();
return dev_ctx->backend_ctx;
// For now we support a single device
ggml_backend_opencl_n_devices = 1;
return backend_ctx;
}
static void ggml_cl2_free(void) {
@@ -1755,46 +1664,10 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
GGML_UNUSED(backend);
}
// Synchronizes the 'backend_ctx's device with others so that commands
// enqueued to it won't start until commands in the other devices have
// completed.
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
if (g_ggml_backend_opencl_devices.size() < 2)
return; // No other devices to synchronize with.
std::vector<cl_event> events;
events.reserve(g_ggml_backend_opencl_devices.size());
for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
if (backend_ctx != other_backend_ctx) {
cl_event ev;
CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
CL_CHECK(clFlush(other_backend_ctx->queue));
events.push_back(ev);
}
}
CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
for (auto ev : events) {
CL_CHECK(clReleaseEvent(ev));
}
}
static void sync_with_other_backends(ggml_backend_t backend) {
auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
sync_with_other_backends(backend_ctx);
}
static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
// NOTE: this may oversynchronize by synchronizing with
// backends/devices which don't compute 'cgraph's
// dependencies.
sync_with_other_backends(backend);
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
@@ -2185,16 +2058,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
// The original tensor memory is divided into scales and quants, i.e.,
// we first store scales, then quants.
// Create subbuffer for scales.
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
region.origin = extra_orig->offset + tensor->view_offs + offset;
region.size = size_d;
extra->d = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
auto previous_origin = region.origin;
// Create subbuffer for quants.
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
region.size = size_q;
extra->q = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
@@ -2399,8 +2271,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
cl_context context = backend_ctx->context;
cl_command_queue queue = backend_ctx->queue;
// Make sure all previously submitted commands in other devices are finished.
sync_with_other_backends(backend_ctx);
// Make sure all previously submitted commands are finished.
CL_CHECK(clFinish(queue));
#ifdef GGML_OPENCL_SOA_Q
// In end-to-end runs, get_tensor is usually used to get back the logits,
@@ -2504,8 +2376,13 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
}
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
return backend_ctx->alignment;
// FIXME: not thread safe, device may not be initialized yet
static cl_uint alignment = -1;
if (alignment == (cl_uint)-1) {
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
alignment = backend_ctx->alignment;
}
return alignment;
}
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
@@ -2532,6 +2409,16 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .is_host = */ NULL,
};
ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
static ggml_backend_buffer_type buffer_type = {
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
/* .device = */ &g_ggml_backend_opencl_device,
/* .context = */ nullptr,
};
return &buffer_type;
}
//
// backend device
//
@@ -2589,15 +2476,9 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
}
static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);
return ggml_backend_opencl_buffer_type();
dev_ctx->buffer_type = ggml_backend_buffer_type{
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
/* .device = */ dev,
/* .context = */ nullptr,
};
return &dev_ctx->buffer_type;
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -2613,21 +2494,12 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const
}
static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
// Check 'dev' and 'buffer_type' are not objects belonging to this backend.
if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
return false;
}
return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
// Check cl_context is the same. clEnqueue* commands may not use
// buffers from another cl_context.
ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
return backend_ctx0->context == backend_ctx1->context;
GGML_UNUSED(dev);
}
namespace /* anonymous */ {
struct ggml_backend_device_i ggml_backend_opencl_device_i = {
static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
/* .get_name = */ ggml_backend_opencl_device_get_name,
/* .get_description = */ ggml_backend_opencl_device_get_description,
/* .get_memory = */ ggml_backend_opencl_device_get_memory,
@@ -2644,7 +2516,6 @@ struct ggml_backend_device_i ggml_backend_opencl_device_i = {
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
}
// Backend registry
@@ -2655,15 +2526,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
}
static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
return g_ggml_backend_opencl_devices.size();
return ggml_backend_opencl_n_devices;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
GGML_ASSERT(index == 0);
return &g_ggml_backend_opencl_devices[index];
return &g_ggml_backend_opencl_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
@@ -2677,23 +2548,27 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
};
ggml_backend_reg_t ggml_backend_opencl_reg(void) {
static std::mutex mutex;
// TODO: make this thread-safe somehow?
static ggml_backend_reg reg;
static bool initialized = false;
std::lock_guard<std::mutex> lock(mutex);
if (initialized) {
return &reg;
if (!initialized) {
reg = ggml_backend_reg {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_opencl_reg_i,
/* .context = */ NULL,
};
g_ggml_backend_opencl_device = ggml_backend_device {
/* .iface = */ ggml_backend_opencl_device_i,
/* .reg = */ &reg,
/* .context = */ &g_ggml_ctx_dev_main,
};
ggml_cl2_init(&g_ggml_backend_opencl_device);
initialized = true;
}
initialized = true;
g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(&reg);
reg = ggml_backend_reg{
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_opencl_reg_i,
/* .context = */ NULL,
};
return &reg;
}
@@ -3067,19 +2942,14 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
} else {
unsigned int nth = MIN(64, ne0);
@@ -3207,19 +3077,14 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
} else {
unsigned int nth = MIN(64, ne0);
@@ -3368,19 +3233,14 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
@@ -3413,19 +3273,14 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
@@ -3465,19 +3320,14 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
@@ -4380,19 +4230,14 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
@@ -4573,19 +4418,14 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
}
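All four hunks above apply the same fallback: instead of always passing the fixed {64, 1, 1} local size, the enqueue goes through local_work_size_ptr, which is set to nullptr when the element count is not a multiple of 64 and the device cannot handle non-uniform work-groups, so the OpenCL driver picks the work-group sizes itself. A minimal sketch of that decision rule (plain C++; the helper name is illustrative, not part of this diff):

#include <cstddef>
#include <cstdio>

// Returns the pointer to pass as the local work size: the preferred fixed size
// when it divides the global size (or the device supports non-uniform
// work-groups), otherwise nullptr so the OpenCL driver picks the sizes itself.
static const std::size_t * pick_local_work_size(std::size_t n, const std::size_t preferred[3],
                                                bool non_uniform_workgroups_supported) {
    if (n % preferred[0] != 0 && !non_uniform_workgroups_supported) {
        return nullptr;
    }
    return preferred;
}

int main() {
    const std::size_t preferred[3] = { 64, 1, 1 };
    std::printf("n=128 -> %s\n", pick_local_work_size(128, preferred, false) ? "64x1x1" : "driver-chosen");
    std::printf("n=100 -> %s\n", pick_local_work_size(100, preferred, false) ? "64x1x1" : "driver-chosen");
    std::printf("n=100 (non-uniform ok) -> %s\n", pick_local_work_size(100, preferred, true) ? "64x1x1" : "driver-chosen");
}

Passing NULL for the local size is valid for clEnqueueNDRangeKernel; on OpenCL 1.x devices an explicit local size that does not evenly divide the global size would instead fail with CL_INVALID_WORK_GROUP_SIZE.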

View File

@@ -1,93 +1,74 @@
#include "binbcast.hpp"
#include <array>
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>
#include "dpct/helper.hpp"
#include "ggml.h"
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1));
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) /
ne3;
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) %
ne3;
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
for (int i0 = i0s; i0 < ne0;
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1,
dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) {
auto element_id = it.get_global_id(0);
auto global_range = it.get_global_range(0);
for (; element_id < num_elements; element_id += global_range) {
auto src0_float_val = sycl::vec(src0[element_id]).template convert<float, sycl::rounding_mode::rte>();
auto src1_float_val = sycl::vec(src1[element_id]).template convert<float, sycl::rounding_mode::rte>();
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
dst[element_id] = val_to_store;
}
}
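k_bin_bcast_contiguous above is a global-range-stride loop: the launch rounds the element count up to a multiple of the work-group size, and each work-item then strides by the total global range until it passes the end of the buffer, so no tail handling is needed. A self-contained sketch of the same pattern, assuming a SYCL 2020 toolchain such as DPC++ (buffer sizes and names are illustrative):

#include <sycl/sycl.hpp>
#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t n = 1000003;        // deliberately not a multiple of the work-group size
    constexpr std::size_t local_range = 256;
    const std::size_t global_range = ((n + local_range - 1) / local_range) * local_range;

    sycl::queue q;
    float * a = sycl::malloc_shared<float>(n, q);
    float * b = sycl::malloc_shared<float>(n, q);
    float * c = sycl::malloc_shared<float>(n, q);
    for (std::size_t i = 0; i < n; ++i) { a[i] = 1.0f; b[i] = 2.0f; }

    q.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_range), sycl::range<1>(local_range)),
                   [=](sycl::nd_item<1> it) {
        // global-range stride loop: each work-item handles every global_range-th
        // element, so any n is covered without out-of-bounds accesses
        for (std::size_t i = it.get_global_id(0); i < n; i += it.get_global_range(0)) {
            c[i] = a[i] + b[i];
        }
    }).wait();

    std::printf("c[0]=%g c[n-1]=%g\n", c[0], c[n - 1]);
    sycl::free(a, q); sycl::free(b, q); sycl::free(c, q);
}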
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13,
int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10,
int s11, int s12, int s13, std::size_t num_dst_elements,
const sycl::nd_item<1> & item_ct1) {
auto calculate_logical_index =
[](const std::array<int, 4> & dims, std::size_t element_id) __attribute__((always_inline))->std::array<int, 4> {
std::array<int, 4> logical_index;
#pragma unroll(4)
for (int i = 3; i >= 0; i--) {
logical_index[i] = element_id % dims[i];
element_id /= dims[i];
}
return logical_index;
};
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
auto calculate_index = [](const std::array<int, 4> & dims, const std::array<int, 4> & strides,
const std::array<int, 4> & indices) __attribute__((always_inline))
->std::size_t {
std::size_t index = 0;
#pragma unroll(4)
for (int i = 0; i < 4; i++) {
auto index_i = indices[i];
if (indices[i] >= dims[i]) {
index_i = indices[i] % dims[i];
}
index += strides[i] * index_i;
}
return index;
};
const int i3 = i/(ne2*ne1*ne0);
const int i2 = (i/(ne1*ne0)) % ne2;
const int i1 = (i/ne0) % ne1;
const int i0 = i % ne0;
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
auto element_id = item_ct1.get_global_id(0);
for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) {
auto logical_index = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id);
auto src_0_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index);
auto src_1_index = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index);
auto dst_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index);
auto src0_float_val = sycl::vec(src0[src_0_index]).template convert<float, sycl::rounding_mode::rte>();
auto src1_float_val = sycl::vec(src1[src_1_index]).template convert<float, sycl::rounding_mode::rte>();
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
dst[dst_index] = val_to_store;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
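The reworked k_bin_bcast maps each flat element id through two helpers: one unravels the id into logical indices (i3, i2, i1, i0) with dimensions ordered slowest to fastest, the other turns logical indices into a linear offset for a given tensor, wrapping each index with a modulo so smaller operands are broadcast. A host-side sketch of the same arithmetic (plain C++; the shapes and strides are illustrative):

#include <array>
#include <cstddef>
#include <cstdio>

// Unravel a flat element id into 4D indices; dims are ordered slowest..fastest (ne3, ne2, ne1, ne0).
static std::array<int, 4> unravel(const std::array<int, 4> & dims, std::size_t id) {
    std::array<int, 4> idx{};
    for (int i = 3; i >= 0; --i) {      // start from the fastest-moving dimension
        idx[i] = static_cast<int>(id % dims[i]);
        id /= dims[i];
    }
    return idx;
}

// Map logical indices to a linear offset; each index is wrapped (modulo) so a
// smaller tensor is broadcast along dimensions where its dims[i] is 1.
static std::size_t ravel(const std::array<int, 4> & dims, const std::array<int, 4> & strides,
                         const std::array<int, 4> & idx) {
    std::size_t off = 0;
    for (int i = 0; i < 4; ++i) {
        off += static_cast<std::size_t>(strides[i]) * (idx[i] % dims[i]);
    }
    return off;
}

int main() {
    const std::array<int, 4> dst_dims     = { 2, 3, 4, 5 };     // ne3, ne2, ne1, ne0
    const std::array<int, 4> src1_dims    = { 1, 1, 4, 5 };     // broadcast over the two slowest dims
    const std::array<int, 4> src1_strides = { 20, 20, 5, 1 };   // element strides of a contiguous 1x1x4x5 tensor
    for (std::size_t id : { std::size_t(0), std::size_t(27), std::size_t(119) }) {
        auto idx = unravel(dst_dims, id);
        std::printf("id=%zu -> (%d,%d,%d,%d) -> src1 offset %zu\n",
                    id, idx[0], idx[1], idx[2], idx[3], ravel(src1_dims, src1_strides, idx));
    }
}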
template<float (*bin_op)(const float, const float)>
struct bin_bcast_sycl {
template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
template <typename src0_t, typename src1_t, typename dst_t>
void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
@@ -96,165 +77,73 @@ struct bin_bcast_sycl {
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
int nr0 = ne10 / ne0;
int nr1 = ne11/ne1;
int nr2 = ne12/ne2;
int nr3 = ne13/ne3;
int nr[4] = { nr0, nr1, nr2, nr3 };
// collapse dimensions until first broadcast dimension
int64_t cne[] = {ne0, ne1, ne2, ne3};
int64_t cne0[] = {ne00, ne01, ne02, ne03};
int64_t cne1[] = {ne10, ne11, ne12, ne13};
size_t cnb[] = {nb0, nb1, nb2, nb3};
size_t cnb0[] = {nb00, nb01, nb02, nb03};
size_t cnb1[] = {nb10, nb11, nb12, nb13};
auto collapse = [](int64_t cne[]) {
cne[0] *= cne[1];
cne[1] = cne[2];
cne[2] = cne[3];
cne[3] = 1;
};
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
cnb[1] *= cne[1];
cnb[2] *= cne[2];
cnb[3] *= cne[3];
};
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
auto check_bcast_required = [](const std::array<int64_t, 4> & src_dims,
const std::array<int64_t, 4> & dst_dims) -> bool {
for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
}
if (i > 0) {
collapse_nb(cnb, cne);
collapse_nb(cnb0, cne0);
collapse_nb(cnb1, cne1);
collapse(cne);
collapse(cne0);
collapse(cne1);
if (dst_dims[i] > src_dims[i]) {
return true;
}
}
}
{
int64_t ne0 = cne[0];
int64_t ne1 = cne[1];
int64_t ne2 = cne[2];
int64_t ne3 = cne[3];
return false;
};
int64_t ne10 = cne1[0];
int64_t ne11 = cne1[1];
int64_t ne12 = cne1[2];
int64_t ne13 = cne1[3];
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
size_t nb0 = cnb[0];
size_t nb1 = cnb[1];
size_t nb2 = cnb[2];
size_t nb3 = cnb[3];
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
size_t nb00 = cnb0[0];
size_t nb01 = cnb0[1];
size_t nb02 = cnb0[2];
size_t nb03 = cnb0[3];
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
size_t nb10 = cnb1[0];
size_t nb11 = cnb1[1];
size_t nb12 = cnb1[2];
size_t nb13 = cnb1[3];
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
// dst strides in number of elements
size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
size_t s10 = nb10 / sizeof(src1_t);
size_t s11 = nb11 / sizeof(src1_t);
size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t);
// src1 strides in number of elements
size_t s10 = nb10 / sizeof(src0_t);
size_t s11 = nb11 / sizeof(src1_t);
size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t);
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
// src0 strides in number of elements
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
GGML_UNUSED(s00);
std::size_t num_dst_elements = static_cast<std::size_t>(ne0) * static_cast<std::size_t>(ne1) *
static_cast<std::size_t>(ne2) * static_cast<std::size_t>(ne3);
std::size_t local_range = 256;
std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range;
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) ||
check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 });
bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous;
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1);
GGML_ASSERT(s10 == 1);
const int block_size = 128;
int64_t hne0 = std::max(ne0/2LL, 1LL);
sycl::range<3> block_dims(1, 1, 1);
block_dims[2] = std::min<unsigned int>(hne0, block_size);
block_dims[1] = std::min<unsigned int>(
ne1, block_size / (unsigned int)block_dims[2]);
block_dims[0] = std::min(
std::min<unsigned int>(
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
(unsigned int)block_dims[1]),
64U);
sycl::range<3> block_nums(
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
(ne1 + block_dims[1] - 1) / block_dims[1],
(hne0 + block_dims[2] - 1) / block_dims[2]);
if (block_nums[0] > 65535) {
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
sycl::range<3>(1, 1, block_size),
sycl::range<3>(1, 1, block_size)),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast_unravel<bin_op>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
s03, s11, s12, s13, item_ct1);
});
}
} else {
/*
DPCT1049:16: The work-group size passed to the SYCL kernel may
exceed the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if
needed.
*/
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
ne2, ne3, ne10, ne11, ne12, ne13,
s1, s2, s3, s01, s02, s03, s11, s12, s13,
item_ct1);
});
}
if (! needs_broadcasting && all_contiguous) {
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
k_bin_bcast_contiguous<bin_op>(src0_dd, src1_dd, dst_dd, num_dst_elements, it);
});
});
} else {
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1,
s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it);
});
});
}
}
};
@@ -319,27 +208,32 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *ds
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_add(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_sub(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_mul(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_div(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_repeat(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

View File

@@ -15,7 +15,6 @@
#include <fstream>
#include <iostream>
#include <string>
#include "dpct/helper.hpp"
#include "ggml-sycl.h"
@@ -45,20 +44,11 @@ extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;
extern int g_ggml_sycl_prioritize_dmmv;
#if defined(__clang__) && __has_builtin(__builtin_expect)
// Hint the optimizer which branch outcome is expected so it can schedule the likely path first
# define LIKELY(expr) __builtin_expect(expr, true)
# define UNLIKELY(expr) __builtin_expect(expr, false)
#else
# define LIKELY(expr) (expr)
# define UNLIKELY(expr) (expr)
#endif
#define GGML_SYCL_DEBUG(...) \
do { \
if (UNLIKELY(g_ggml_sycl_debug)) \
fprintf(stderr, __VA_ARGS__); \
} while (0)
#define GGML_SYCL_DEBUG(...) \
do { \
if (g_ggml_sycl_debug) \
fprintf(stderr, __VA_ARGS__); \
} while (0)
#define CHECK_TRY_ERROR(expr) \
[&]() { \
@@ -500,76 +490,4 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
}
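ceil_div above is what the bin_bcast launch earlier in this diff uses to pad the global range to a multiple of the work-group size. A tiny sketch of that arithmetic, assuming the conventional (m + n - 1) / n round-up definition (the function body is not shown in this hunk):

#include <cassert>
#include <cstddef>

constexpr std::size_t ceil_div_example(std::size_t m, std::size_t n) { return (m + n - 1) / n; }

int main() {
    constexpr std::size_t num_elements = 1000;
    constexpr std::size_t local_range  = 256;
    constexpr std::size_t global_range = ceil_div_example(num_elements, local_range) * local_range;
    static_assert(global_range == 1024, "rounded up to the next multiple of 256");
    static_assert(global_range % local_range == 0, "an nd_range requires the global size to be a multiple of the local size");
    assert(ceil_div_example(1024, 256) == 4);
}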
bool gpu_has_xmx(sycl::device &dev);
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
if (LIKELY(!g_ggml_sycl_debug)) {
return;
}
std::stringstream ss;
ss << prefix << "=[";
for (std::size_t i = 0; i < N - 1; ++i) {
ss << array[i] << ", ";
}
if constexpr (N > 0) {
ss << array[N - 1];
}
ss << "]";
GGML_SYCL_DEBUG("%s", ss.str().c_str());
}
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
const std::string & suffix = "") {
if (LIKELY(!g_ggml_sycl_debug)) {
return;
}
GGML_SYCL_DEBUG("%s=", prefix.c_str());
if (tensor) {
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
if (!ggml_is_contiguous(tensor)) {
GGML_SYCL_DEBUG(";strided");
}
if (ggml_is_permuted(tensor)) {
GGML_SYCL_DEBUG(";permuted");
}
} else {
GGML_SYCL_DEBUG("nullptr");
}
GGML_SYCL_DEBUG("%s", suffix.c_str());
}
// Use scope_op_debug_print to log operations coming from running a model
struct scope_op_debug_print {
// Use string_views to avoid the cost of creating and concatenating strings.
// The string_views must stay alive for as long as the object is alive;
// in practice scope_op_debug_print is used with string literals, which live in constant storage and are therefore always accessible.
scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
std::size_t num_src, const std::string_view & suffix = "") :
func(func),
func_suffix(func_suffix) {
if (LIKELY(!g_ggml_sycl_debug)) {
return;
}
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
debug_print_tensor(" dst", dst);
if (dst) {
for (std::size_t i = 0; i < num_src; ++i) {
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
}
}
GGML_SYCL_DEBUG("%s\n", suffix.data());
}
scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
const std::string_view & suffix = "") :
scope_op_debug_print(func, "", dst, num_src, suffix) {}
~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }
private:
std::string_view func;
std::string_view func_suffix;
};
#endif // GGML_SYCL_COMMON_HPP
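scope_op_debug_print is an RAII helper: constructing it logs the operation and its tensors, destroying it logs completion, and both paths return immediately when debugging is off, so an instrumented call site stays a single line. A reduced standalone sketch of the same idea (plain C++; names are illustrative, not the real ggml types):

#include <cstdio>
#include <string_view>

static int g_debug = 1;   // stand-in for the runtime debug flag

struct scope_logger {
    explicit scope_logger(std::string_view func) : func(func) {
        if (!g_debug) { return; }                 // cheap early-out on the hot path
        std::fprintf(stderr, "[OP] call %.*s\n", (int) func.size(), func.data());
    }
    ~scope_logger() {
        if (!g_debug) { return; }
        std::fprintf(stderr, "[OP] call %.*s done\n", (int) func.size(), func.data());
    }
  private:
    std::string_view func;                        // must outlive the logger; string literals do
};

static void some_op() {
    scope_logger dbg(__func__);                   // logs entry here and exit when dbg is destroyed
    // ... real work ...
}

int main() { some_op(); }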

View File

@@ -159,37 +159,39 @@ static void concat_f32_sycl_non_cont(
}
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
queue_ptr stream = ctx.stream();
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
queue_ptr stream = ctx.stream();
const int32_t dim = ((int32_t *) dst->op_params)[0];
const int32_t dim = ((int32_t *)dst->op_params)[0];
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
const float *src0_d = (const float *)src0->data;
const float *src1_d = (const float *)src1->data;
float * dst_d = (float *) dst->data;
float *dst_d = (float *)dst->data;
if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
dst->ne[1], dst->ne[2], dim, stream);
}
} else {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
}
if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_sycl(
src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
}
} else {
concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
}
} else
concat_f32_sycl_non_cont(
stream, (const char *)src0->data, (const char *)src1->data,
(char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
}
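The contiguous dim == 3 branch above exploits the fact that dimension 3 is the slowest-varying one in ggml's layout, so concatenating two contiguous tensors along it just lays their byte ranges end to end: two memcpys instead of a kernel. A host-side illustration (plain C++):

#include <cstring>
#include <cstdio>
#include <vector>

int main() {
    // two "tensors" that are contiguous in memory; concatenating along the
    // outermost (slowest-varying) dimension appends one block after the other
    std::vector<float> a(2 * 4, 1.0f);   // outer size 2, inner size 4
    std::vector<float> b(3 * 4, 2.0f);   // outer size 3, inner size 4
    std::vector<float> dst(a.size() + b.size());

    std::memcpy(dst.data(),            a.data(), a.size() * sizeof(float));
    std::memcpy(dst.data() + a.size(), b.data(), b.size() * sizeof(float));

    std::printf("dst[0]=%g dst[%zu]=%g\n", dst[0], a.size(), dst[a.size()]); // 1 then 2
}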

View File

@@ -72,7 +72,6 @@ static void conv_transpose_1d_f32_f32_sycl(
}
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
const float * src0_d = (const float *)src0->data;

View File

@@ -616,9 +616,6 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
}
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
// Unlike other operators, ggml_sycl_cpy takes two distinct tensors instead of a single dst ggml_tensor and relying on its src field
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
std::string(" src0 type=") + ggml_type_name(src0->type));
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -632,6 +629,8 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
char * src0_ddc = (char *) src0->data;
char * src1_ddc = (char *) src1->data;
GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
ggml_type_name(src1->type));
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
@@ -695,6 +694,8 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
}
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
// TODO: why do we pass dst as src1 here?
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
ggml_sycl_cpy(ctx, dst->src[0], dst);
GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
}

View File

@@ -1092,8 +1092,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
if (src1_convert_f16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
" : converting src1 to fp16");
src1_dfloat = src1_dfloat_a.alloc(ne00);
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
GGML_ASSERT(to_fp16_sycl != nullptr);

View File

@@ -1391,121 +1391,146 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_sqrt(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_sin(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_cos(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_acc(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_gelu(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_silu(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_gelu_quick(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_tanh(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_relu(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_sigmoid(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_hardsigmoid(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_hardswish(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_exp(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_log(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_neg(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_step(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_leaky_relu(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_sqr(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_upscale(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_pad(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_clamp(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_sgn(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_abs(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_elu(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

View File

@@ -257,7 +257,8 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
GGML_UNUSED(ctx);
}
void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -307,3 +308,4 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ABORT("fatal error");
}
}

View File

@@ -346,8 +346,6 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
static enum ggml_status
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor, "\n");
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
if (tensor->view_src != NULL) {
@@ -383,23 +381,20 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor,
const void *data, size_t offset,
size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
ggml_sycl_set_device(ctx->device);
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
#ifndef _WIN32
SYCL_CHECK(
CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
// Note: copy the data from mmap() into a host buffer first, then copy that buffer to the device. This works around an mmap() issue on PVC GPUs.
// This function is called while loading the model from disk. Replacing the per-call allocation with a persistent buffer would not save time and would add a potential memory-leak risk here.
char * host_buf = (char *) malloc(size);
char* host_buf = (char*)malloc(size);
memcpy(host_buf, data, size);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
SYCL_CHECK(
CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
.wait()));
free(host_buf);
#else
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait()));
#endif
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
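The non-Windows branch of set_tensor above stages mmap()ed data through a plain heap buffer before the device copy, per the comment about the PVC mmap issue. A minimal sketch of that staging pattern, assuming a SYCL 2020 toolchain (the helper name and sizes are illustrative):

#include <sycl/sycl.hpp>
#include <cstdlib>
#include <cstring>
#include <vector>

// Hedged sketch: copy from a source the driver may not read directly (e.g. an
// mmap()ed file region) into a plain heap buffer, then upload that buffer.
static void staged_upload(sycl::queue & q, void * device_dst, const void * src, std::size_t size) {
    void * host_buf = std::malloc(size);
    std::memcpy(host_buf, src, size);              // CPU-side copy out of the mmapped region
    q.memcpy(device_dst, host_buf, size).wait();   // blocking upload from pageable host memory
    std::free(host_buf);
}

int main() {
    sycl::queue q;
    constexpr std::size_t n = 1 << 20;
    std::vector<float> src(n, 3.0f);               // stand-in for an mmapped weight region
    float * dev = sycl::malloc_device<float>(n, q);
    staged_upload(q, dev, src.data(), n * sizeof(float));
    sycl::free(dev, q);
}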
@@ -411,9 +406,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor *tensor,
void *data, size_t offset,
size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
ggml_sycl_set_device(ctx->device);
@@ -441,12 +434,7 @@ static bool
ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor *src,
ggml_tensor *dst) try {
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": dst=", dst);
debug_print_tensor(" src=", src);
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
if (is_cpy_supported) {
if (ggml_backend_buffer_is_sycl(src->buffer)) {
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
@@ -503,8 +491,7 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
uint8_t value) try {
GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size);
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
ggml_sycl_set_device(ctx->device);
queue_ptr stream = ctx->stream;
@@ -523,9 +510,7 @@ catch (sycl::exception const &exc) {
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
size_t offset, size_t size) {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
GGML_SYCL_DEBUG(" [SYCL] call %s\n", __func__);
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
@@ -803,8 +788,6 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
static enum ggml_status
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor, "\n");
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
@@ -889,9 +872,6 @@ static void
ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor, const void *data,
size_t offset, size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -945,9 +925,6 @@ static void
ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor *tensor, void *data,
size_t offset, size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -2037,12 +2014,12 @@ inline void ggml_sycl_op_mul_mat_sycl(
#else
bool use_fp16 = false;
#endif
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
dst->op_params[0] == GGML_PREC_DEFAULT) {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
if (src0->type != GGML_TYPE_F16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
" : converting src0 to fp16");
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
GGML_ASSERT(to_fp16_sycl != nullptr);
size_t ne = row_diff*ne00;
@@ -2055,8 +2032,6 @@ inline void ggml_sycl_op_mul_mat_sycl(
ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
" : converting src1 to fp16");
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
GGML_ASSERT(to_fp16_sycl != nullptr);
size_t ne = src1_ncols*ne10;
@@ -2073,8 +2048,6 @@ inline void ggml_sycl_op_mul_mat_sycl(
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting dst to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
}
@@ -2090,25 +2063,21 @@ inline void ggml_sycl_op_mul_mat_sycl(
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
dst_f16.get(), dpct::library_data_t::real_half, ldc,
dpct::library_data_t::real_half)));
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting dst to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
}
} else {
}
else {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
if (src0->type != GGML_TYPE_F32) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting src0 to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
GGML_ASSERT(to_fp32_sycl != nullptr);
src0_ddq_as_f32.alloc(row_diff*ne00);
to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
}
if (src1->type != GGML_TYPE_F32) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting src1 to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
GGML_ASSERT(to_fp32_sycl != nullptr);
src1_ddq_as_f32.alloc(src1_ncols*ne10);
@@ -2144,7 +2113,8 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2196,7 +2166,8 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
}
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2227,7 +2198,8 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
}
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_I32);
@@ -2242,7 +2214,8 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * d
argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
}
inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2259,7 +2232,8 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
}
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2446,8 +2420,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
if (src1_on_device && src1_is_contiguous) {
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
/*num_src=*/2, " : converting src1 to Q8_1");
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
/*
DPCT1010:90: SYCL uses exceptions to report errors and does not
@@ -2552,8 +2524,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
}
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
/*num_src=*/2, " : converting src1 to Q8_1");
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
/*
DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -2648,28 +2618,33 @@ catch (sycl::exception const &exc) {
static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_get_rows(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_rms_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_l2_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_group_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2797,8 +2772,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
// convert src1 to fp16
if (src1->type != GGML_TYPE_F16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
" : converting src1 to fp16");
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
const int64_t ne_src1 = ggml_nelements(src1);
@@ -3054,7 +3027,7 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
}
static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
@@ -3102,7 +3075,6 @@ static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor *
}
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
int64_t min_compute_capability = INT_MAX;
@@ -3178,8 +3150,11 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
} else {
constexpr bool convert_src1_to_q8_1 = false;
// MUL_MAT_SYCL supports reorder
opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
}
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
@@ -3250,7 +3225,6 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
ggml_tensor *dst) try {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
@@ -3419,45 +3393,37 @@ catch (sycl::exception const &exc) {
}
static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_scale(ctx, dst);
}
static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_diag_mask_inf(ctx, dst);
}
static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_pool2d(ctx, dst);
}
static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
ggml_sycl_op_im2col(ctx, dst);
}
static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_sum(ctx, dst);
}
static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_sum_rows(ctx, dst);
}
static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_argsort(ctx, dst);
}
static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_argmax(ctx, dst);
}
@@ -3751,9 +3717,6 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
ggml_tensor *tensor,
const void *data, size_t offset,
size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3772,16 +3735,13 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
const ggml_tensor *tensor,
void *data, size_t offset,
size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
data, (const char *)tensor->data + offset, size)));
data, (const char *)tensor->data + offset, size).wait()));
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3793,13 +3753,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
const ggml_tensor *src,
ggml_tensor *dst) try {
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
ggml_backend_buffer_is_sycl(src->buffer);
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": dst=", dst);
debug_print_tensor(" src=", src);
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
if (is_cpy_supported) {
if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
/*
DPCT1009:215: SYCL uses exceptions to report errors and does not use the
error codes. The original code was commented out and a warning string
@@ -3807,7 +3761,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
*/
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
dst->data, src->data, ggml_nbytes(dst))));
dst->data, src->data, ggml_nbytes(dst)).wait()));
return true;
}
@@ -3820,7 +3774,6 @@ catch (sycl::exception const &exc) {
}
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
@@ -3857,43 +3810,11 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
}
}
#ifdef GGML_SYCL_GRAPH
static bool check_graph_compatibility(ggml_cgraph * cgraph) {
if (ggml_sycl_info().device_count > 1) {
// A sycl_ex::command_graph object can only be created for a single device
GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__);
return false;
}
for (int i = 0; i < cgraph->n_nodes; i++) {
const ggml_op node_op = cgraph->nodes[i]->op;
switch (node_op) {
default:
break;
case GGML_OP_CONCAT:
// ggml_sycl_op_concat() does a blocking host wait after memcpy operations,
// but wait() can't be called on the events returned by a queue recording
// to a graph.
[[fallthrough]];
case GGML_OP_MUL_MAT_ID:
// ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after
// submitting a memcpy operation, but wait() can't be called on a queue that
// is recording to a graph.
GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
ggml_op_name(node_op));
return false;
}
}
return true;
}
#endif
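check_graph_compatibility encodes two rules: a command graph can only target a single device, and any node whose implementation performs a blocking host wait on the queue (the CONCAT and MUL_MAT_ID cases noted in the comments) cannot be recorded into one. A stripped-down sketch of that kind of pre-scan (plain C++ with stand-in types, not the real ggml structures):

#include <cstdio>
#include <vector>

enum class op { add, mul_mat, concat, mul_mat_id };

// Ops whose backend implementation calls wait() on the queue; such waits are
// not allowed while a command graph is being recorded.
static bool blocks_on_host(op o) {
    switch (o) {
        case op::concat:
        case op::mul_mat_id:
            return true;
        default:
            return false;
    }
}

static bool graph_compatible(const std::vector<op> & nodes, int device_count) {
    if (device_count > 1) {
        return false;                       // a command graph targets a single device
    }
    for (op o : nodes) {
        if (blocks_on_host(o)) {
            return false;
        }
    }
    return true;
}

int main() {
    std::printf("%d\n", graph_compatible({ op::add, op::mul_mat }, 1));   // 1
    std::printf("%d\n", graph_compatible({ op::add, op::concat }, 1));    // 0
}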
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
#ifdef GGML_SYCL_GRAPH
bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph);
if (use_sycl_graph) {
if (!g_ggml_sycl_disable_graph) {
const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph);
if (!graph_support) {
GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
@@ -3954,7 +3875,7 @@ catch (sycl::exception const &exc)
}
static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
if (ggml_backend_is_sycl(backend)) {
@@ -4349,7 +4270,6 @@ static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_bac
static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
GGML_UNUSED(dev);
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));

View File

@@ -76,7 +76,6 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
}
void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/5);
const float * k_d = static_cast<const float *>(dst->src[0]->data);
const float * v_d = static_cast<const float *>(dst->src[1]->data);
const float * r_d = static_cast<const float *>(dst->src[2]->data);

View File

@@ -1059,10 +1059,8 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
case GGML_TYPE_Q4_K:
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
} else {
GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;

View File

@@ -1,7 +1,6 @@
#include "outprod.hpp"
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];

View File

@@ -355,7 +355,8 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
}
void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_rope(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

View File

@@ -225,7 +225,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
}
void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -249,13 +249,16 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
GGML_SYCL_DEBUG("%s: F16 mask\n", __func__);
soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
main_stream, ctx.device);
} else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
GGML_SYCL_DEBUG("%s: F32 mask\n", __func__);
soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
} else {
/* mask unavailable */
GGML_SYCL_DEBUG("%s: No mask\n", __func__);
soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
}
}

View File

@@ -56,8 +56,8 @@ static void timestep_embedding_f32_sycl(
}
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
dpct::queue_ptr stream = ctx.stream();
@@ -69,4 +69,5 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso
const int max_period = dst->op_params[1];
timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
GGML_UNUSED(src1);
}

View File

@@ -180,7 +180,10 @@ static void rwkv_wkv7_f32_kernel(
}
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/6);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
const float* k_d = (const float*)dst->src[0]->data;
const float* v_d = (const float*)dst->src[1]->data;
const float* r_d = (const float*)dst->src[2]->data;
@@ -233,10 +236,16 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
});
});
}
GGML_UNUSED(src0);
GGML_UNUSED(src1);
}
void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/7);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
const float* r_d = (const float*)dst->src[0]->data;
const float* w_d = (const float*)dst->src[1]->data;
const float* k_d = (const float*)dst->src[2]->data;
@@ -290,4 +299,7 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
});
});
}
GGML_UNUSED(src0);
GGML_UNUSED(src1);
}

View File

@@ -2804,29 +2804,23 @@ static vk_device ggml_vk_get_device(size_t idx) {
pipeline_robustness = true;
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
device->subgroup_size_control = true;
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT")) {
device->coopmat_support = true;
device->coopmat_m = 0;
device->coopmat_n = 0;
device->coopmat_k = 0;
#endif
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
coopmat2_support = true;
#endif
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
device->integer_dot_product = true;
#endif
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
} else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
bfloat16_support = true;
#endif
}
}
@@ -4519,8 +4513,6 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
return aligned ? mmp->a_m : mmp->m;
}
return aligned ? mmp->a_l : mmp->l;
GGML_UNUSED(src1_type);
}
static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) {
@@ -4676,19 +4668,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
}
}
if (src->type == to) {
// Copy two or four bytes at a time, depending on block size.
// For quantized types, we scale by block size/type size. But
// this path is also used for bf16->bf16 for example, where the
// type size must be exactly 2 or 4.
GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4);
if ((ggml_type_size(src->type) % 4) == 0) {
return ctx->device->pipeline_contig_cpy_f32_f32;
} else {
return ctx->device->pipeline_contig_cpy_f16_f16;
}
}
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
GGML_ABORT("fatal error");
}
@@ -6452,7 +6431,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
case GGML_OP_ROPE:
case GGML_OP_RMS_NORM:
case GGML_OP_CONV_2D_DW:
case GGML_OP_IM2COL:
return true;
default:
return false;
@@ -6751,16 +6729,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_UNARY:
case GGML_OP_CONV_2D_DW:
{
uint32_t ne = ggml_nelements(dst);
if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
// Convert from number of logical elements to 2- or 4-byte units.
ne /= ggml_blck_size(src0->type);
if ((ggml_type_size(src0->type) % 4) == 0) {
ne *= ggml_type_size(src0->type) / 4;
} else {
ne *= ggml_type_size(src0->type) / 2;
}
}
const uint32_t ne = ggml_nelements(dst);
if (ne > 262144) {
elements = { 512, 512, CEIL_DIV(ne, 262144) };
} else if (ne > 512) {
@@ -7310,19 +7279,8 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
uint32_t ne = (uint32_t)ggml_nelements(src0);
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
// Convert from number of logical elements to 2- or 4-byte units.
ne /= ggml_blck_size(src0->type);
if ((ggml_type_size(src0->type) % 4) == 0) {
ne *= ggml_type_size(src0->type) / 4;
} else {
ne *= ggml_type_size(src0->type) / 2;
}
}
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
ne,
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
@@ -9304,7 +9262,8 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
try {
ptr = ggml_vk_host_malloc(vk_instance.devices[0], size);
} catch (vk::SystemError& e) {
GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what());
std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
// fallback to cpu buffer
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
@@ -9906,15 +9865,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
return true;
}
// We can handle copying from a type to the same type if it's
// contiguous (memcpy). We use f16 or f32 shaders to do the copy,
// so the type/block size must be a multiple of 4.
if (src0_type == src1_type &&
ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op) &&
(ggml_type_size(src0_type) % 2) == 0) {
return true;
}
return false;
} break;
case GGML_OP_REPEAT:

View File

@@ -1,6 +1,6 @@
#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#include "dequant_head.comp"

View File

@@ -7,7 +7,7 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#endif
#if defined(DATA_A_IQ1_M)
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#endif
#if defined(DATA_A_BF16) && defined(COOPMAT)

View File

@@ -1099,10 +1099,9 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"HARDSWISH",
"HARDSIGMOID",
"EXP",
"GELU_ERF",
};
static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2502,20 +2501,6 @@ struct ggml_tensor * ggml_gelu_inplace(
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
}
// ggml_gelu_erf
struct ggml_tensor * ggml_gelu_erf(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF);
}
struct ggml_tensor * ggml_gelu_erf_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF);
}
// ggml_gelu_quick
struct ggml_tensor * ggml_gelu_quick(

View File

@@ -219,13 +219,10 @@ class Keys:
TYPE = "adapter.type"
LORA_ALPHA = "adapter.lora.alpha"
class Clip:
class ClipVision:
PROJECTOR_TYPE = "clip.projector_type"
HAS_VISION_ENCODER = "clip.has_vision_encoder"
HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
class ClipVision:
IMAGE_SIZE = "clip.vision.image_size"
PATCH_SIZE = "clip.vision.patch_size"
EMBEDDING_LENGTH = "clip.vision.embedding_length"
@@ -246,33 +243,19 @@ class Keys:
class Projector:
SCALE_FACTOR = "clip.vision.projector.scale_factor"
class ClipAudio:
NUM_MEL_BINS = "clip.audio.num_mel_bins"
EMBEDDING_LENGTH = "clip.audio.embedding_length"
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
PROJECTION_DIM = "clip.audio.projection_dim"
BLOCK_COUNT = "clip.audio.block_count"
class Attention:
HEAD_COUNT = "clip.audio.attention.head_count"
LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"
class Projector:
STACK_FACTOR = "clip.audio.projector.stack_factor"
#
# recommended mapping of model tensor names for storage in gguf
#
class GGUFType:
MODEL = "model"
ADAPTER = "adapter"
MMPROJ = "mmproj" # dummy, unused for now
MODEL = "model"
ADAPTER = "adapter"
CLIP_VISION = "clip-vision"
class MODEL_ARCH(IntEnum):
MMPROJ = auto() # dummy arch for clip.cpp
CLIP_VISION = auto() # dummy arch for clip.cpp
LLAMA = auto()
LLAMA4 = auto()
DECI = auto()
@@ -531,28 +514,10 @@ class MODEL_TENSOR(IntEnum):
V_RESMPL_QUERY = auto() # minicpmv
V_TOK_EMBD_IMG_BREAK = auto() # pixtral
V_MM_PATCH_MERGER = auto() # mistral small 3.1
# audio (mtmd)
A_ENC_EMBD_POS = auto()
A_ENC_CONV1D = auto()
A_PRE_NORM = auto()
A_POST_NORM = auto()
A_ENC_ATTN_Q = auto()
A_ENC_ATTN_K = auto()
A_ENC_ATTN_V = auto()
A_ENC_INPUT_NORM = auto()
A_ENC_OUTPUT = auto()
A_ENC_OUTPUT_NORM = auto()
A_ENC_FFN_UP = auto()
A_ENC_FFN_GATE = auto()
A_ENC_FFN_DOWN = auto()
A_MMPROJ = auto()
A_MMPROJ_FC = auto()
A_MM_NORM_PRE = auto()
A_MM_NORM_MID = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MMPROJ: "clip", # dummy arch for clip.cpp
MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.LLAMA4: "llama4",
MODEL_ARCH.DECI: "deci",
@@ -811,28 +776,10 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.MMPROJ: [
MODEL_ARCH.CLIP_VISION: [
MODEL_TENSOR.V_MMPROJ,
MODEL_TENSOR.V_MMPROJ_FC,
MODEL_TENSOR.V_MMPROJ_MLP,
@@ -872,24 +819,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_RESMPL_QUERY,
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
MODEL_TENSOR.V_MM_PATCH_MERGER,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_CONV1D,
MODEL_TENSOR.A_PRE_NORM,
MODEL_TENSOR.A_POST_NORM,
MODEL_TENSOR.A_ENC_ATTN_Q,
MODEL_TENSOR.A_ENC_ATTN_K,
MODEL_TENSOR.A_ENC_ATTN_V,
MODEL_TENSOR.A_ENC_INPUT_NORM,
MODEL_TENSOR.A_ENC_OUTPUT,
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
MODEL_TENSOR.A_ENC_FFN_UP,
MODEL_TENSOR.A_ENC_FFN_GATE,
MODEL_TENSOR.A_ENC_FFN_DOWN,
MODEL_TENSOR.A_MMPROJ,
MODEL_TENSOR.A_MMPROJ_FC,
MODEL_TENSOR.A_MM_NORM_PRE,
MODEL_TENSOR.A_MM_NORM_MID,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
@@ -2257,9 +2186,7 @@ class VisionProjectorType:
LLAMA4 = "llama4"
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"
QWEN2A = "qwen2a" # audio
# Items here are (block size, type size)

View File

@@ -251,7 +251,7 @@ class GGUFReader:
offs += curr_size
return offs - orig_offs, aparts, data_idxs, types
# We can't deal with this one.
raise ValueError(f'Unknown/unhandled field type {gtype}')
raise ValueError('Unknown/unhandled field type {gtype}')
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
offs = orig_offs

View File

@@ -896,7 +896,7 @@ class GGUFWriter:
def add_remove_extra_whitespaces(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
def add_precompiled_charsmap(self, charsmap: bytes) -> None:
def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
@@ -936,18 +936,12 @@ class GGUFWriter:
# for vision models
def add_clip_has_vision_encoder(self, value: bool) -> None:
self.add_bool(Keys.Clip.HAS_VISION_ENCODER, value)
def add_clip_has_audio_encoder(self, value: bool) -> None:
self.add_bool(Keys.Clip.HAS_AUDIO_ENCODER, value)
def add_clip_projector_type(self, value: str) -> None:
self.add_string(Keys.Clip.PROJECTOR_TYPE, value)
def add_vision_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
def add_vision_has_vision_encoder(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
def add_vision_patch_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
@@ -963,6 +957,9 @@ class GGUFWriter:
def add_vision_head_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
def add_vision_projector_type(self, value: str) -> None:
self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
def add_vision_attention_layernorm_eps(self, value: float) -> None:
self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
@@ -990,32 +987,6 @@ class GGUFWriter:
def add_vision_n_wa_pattern(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
# audio models
def add_audio_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)
def add_audio_embedding_length(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.EMBEDDING_LENGTH, value)
def add_audio_feed_forward_length(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.FEED_FORWARD_LENGTH, value)
def add_audio_block_count(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.BLOCK_COUNT, value)
def add_audio_head_count(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Attention.HEAD_COUNT, value)
def add_audio_attention_layernorm_eps(self, value: float) -> None:
self.add_float32(Keys.ClipAudio.Attention.LAYERNORM_EPS, value)
def add_audio_num_mel_bins(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.NUM_MEL_BINS, value)
def add_audio_stack_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ''
if not skip_pack_prefix:

View File

@@ -1110,72 +1110,6 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
),
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: (
"audio_tower.embed_positions", # ultravox
),
MODEL_TENSOR.A_ENC_CONV1D: (
"audio_tower.conv{bid}", # ultravox
),
MODEL_TENSOR.A_PRE_NORM: (),
MODEL_TENSOR.A_POST_NORM: (
"audio_tower.layer_norm", # ultravox
),
MODEL_TENSOR.A_ENC_ATTN_Q: (
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
),
MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
),
MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
),
MODEL_TENSOR.A_ENC_INPUT_NORM: (
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
),
MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
),
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
),
MODEL_TENSOR.A_ENC_FFN_UP: (
"audio_tower.layers.{bid}.fc1", # ultravox
),
MODEL_TENSOR.A_ENC_FFN_GATE: (),
MODEL_TENSOR.A_ENC_FFN_DOWN: (
"audio_tower.layers.{bid}.fc2", # ultravox
),
MODEL_TENSOR.A_MMPROJ: (
"audio.multi_modal_projector.linear_{bid}", # ultravox
),
MODEL_TENSOR.A_MMPROJ_FC: (
"audio.multi_modal_projector.linear", # qwen2audio
),
MODEL_TENSOR.A_MM_NORM_PRE: (
"audio.multi_modal_projector.ln_pre", # ultravox
),
MODEL_TENSOR.A_MM_NORM_MID: (
"audio.multi_modal_projector.ln_mid", # ultravox
),
}
# architecture-specific block mappings

View File

@@ -361,11 +361,10 @@ extern "C" {
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // use flash attention [EXPERIMENTAL]
bool no_perf; // measure performance timings
bool op_offload; // offload host tensor operations to device
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
};
// model quantization parameters
@@ -471,7 +470,6 @@ extern "C" {
LLAMA_API int64_t llama_time_us(void);
LLAMA_API size_t llama_max_devices(void);
LLAMA_API size_t llama_max_parallel_sequences(void);
LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_mlock (void);
@@ -609,14 +607,71 @@ extern "C" {
// KV cache
//
// TODO: start using struct llama_kv_cache
// Information associated with an individual cell in the KV cache view.
struct llama_kv_cache_view_cell {
// The position for this cell. Takes KV cache shifts into account.
// May be negative if the cell is not populated.
llama_pos pos;
};
// An updateable view of the KV cache.
struct llama_kv_cache_view {
// Number of KV cache cells. This will be the same as the context size.
int32_t n_cells;
// Maximum number of sequences that can exist in a cell. It's not an error
// if there are more sequences in a cell than this value, however they will
// not be visible in the view cells_sequences.
int32_t n_seq_max;
// Number of tokens in the cache. For example, if there are two populated
// cells, the first with 1 sequence id in it and the second with 2 sequence
// ids then you'll have 3 tokens.
int32_t token_count;
// Number of populated cache cells.
int32_t used_cells;
// Maximum contiguous empty slots in the cache.
int32_t max_contiguous;
// Index to the start of the max_contiguous slot range. Can be negative
// when cache is full.
int32_t max_contiguous_idx;
// Information for an individual cell.
struct llama_kv_cache_view_cell * cells;
// The sequences for each cell. There will be n_seq_max items per cell.
llama_seq_id * cells_sequences;
};
// Create an empty KV cache view. (use only for debugging purposes)
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
// Free a KV cache view. (use only for debugging purposes)
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
// TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
///
// Returns the number of tokens in the KV cache (slow, use only for debug)
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
"use llama_kv_self_n_tokens instead");
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
"use llama_kv_self_used_cells instead");
// Clear the KV cache - both cell info is erased and KV data is zeroed
LLAMA_API void llama_kv_self_clear(
@@ -675,18 +730,10 @@ extern "C" {
llama_pos p1,
int d);
// Returns the smallest position present in the KV cache for the specified sequence
// This is typically non-zero only for SWA caches
// Return -1 if the sequence is empty
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
struct llama_context * ctx,
llama_seq_id seq_id);
// Returns the largest position present in the KV cache for the specified sequence
// Return -1 if the sequence is empty
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id);
llama_seq_id seq_id);
// Defragment the KV cache
// This will be applied:
@@ -700,6 +747,61 @@ extern "C" {
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
DEPRECATED(LLAMA_API void llama_kv_cache_clear(
struct llama_context * ctx),
"use llama_kv_self_clear instead");
DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1),
"use llama_kv_self_seq_rm instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1),
"use llama_kv_self_seq_cp instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id),
"use llama_kv_self_seq_keep instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta),
"use llama_kv_self_seq_add instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d),
"use llama_kv_self_seq_div instead");
DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id),
"use llama_kv_self_seq_pos_max instead");
DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
"use llama_kv_self_defrag instead");
DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
"use llama_kv_self_can_shift instead");
DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
"use llama_kv_self_update instead");
//
// State / sessions
//
@@ -841,12 +943,9 @@ extern "C" {
// Requires KV cache.
// For encode-decoder contexts, processes the batch using the decoder.
// Positive return values does not mean a fatal error, but rather a warning.
// Upon non-zero return values, the KV cache state is restored to the state before this call
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// 2 - aborted
// -1 - invalid input batch
// < -1 - error
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error. the KV cache state is restored to the state before this call
LLAMA_API int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch);

View File

@@ -1,112 +0,0 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
!!!!!!
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
Cửa Việt
__ggml_vocab_test__
discards
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View File

@@ -1,46 +0,0 @@
17 297 201 78660 21775
72805 4097 56
35378 8999
35378 8999
35378 6661
35378 6661
35378 6661 38
35378 4 8999 38
35378 4 8999 38
903 83 6 3 5 238 6366
148 7709 1019 361 458 134362 104 7 71 420 1132
14271 29 117152
6 149561 78270 48967 64254 7616 81705
6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 15 191 538 28 121505 450 1556 6863 10002 47 1098 16
35378
35378
35378
35378
35378
35378 35378
15
2203
242 1615
35378 4 113 25 5584 38 11249 621 398 6 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087
6 90827
138
3912
6 66000
138 66000
3912 66000
6 66000 66000
138 66000 66000
3912 66000 66000
6 66000 66000 66000
199152 3763
17116 99397
6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 6 3 138 3912 6 66000 138 66000 3912 66000 6 66000 66000 138 66000 66000 3912 66000 66000 80308 1031 5 363 138 27 363 6 149561 78270 48967 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087 6 110405 1369 69112 69112 69112 14271 29 117152 5106 4765 4765 1135 164721 164721 164721 58 58 58 58 2551 90827 32 85908 87 25 272 2809 242 18 18345 764 25 7 2685 4 242 11766 398 9077 32 242 594 959 9077 87 25 1181 3249 442 4 242 397 398 1884 3060 26156 32 1401 25 26455 10 25 141 866

View File

@@ -1,62 +0,0 @@
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0]['role'] == 'system' %}
{{- messages[0]['content'] }}
{%- else %}
{{- '' }}
{%- endif %}
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" and not message.tool_calls %}
{%- set content = message.content %}
{%- if not loop.last %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- if not loop.last %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\n' + content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n<think>\n' }}
{%- endif %}

View File

@@ -1,85 +0,0 @@
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.last or (not loop.last and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- endif %}
{%- endif %}

View File

@@ -19,6 +19,4 @@ These templates can be updated with the following commands:
./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
```

View File

@@ -1,7 +1,3 @@
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"
# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly
torch>=0.0.0.dev0; platform_machine == "s390x"
torch~=2.2.1

View File

@@ -1,7 +1,3 @@
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"
# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly
torch>=0.0.0.dev0; platform_machine == "s390x"
torch~=2.2.1

View File

@@ -1,4 +1,2 @@
-r ./requirements-convert_hf_to_gguf.txt
--extra-index-url https://download.pytorch.org/whl/cpu
# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly

View File

@@ -12,7 +12,6 @@
export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server
export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp}
./scripts/tool_bench.py run --n 10 --temp -1 --temp 0 --temp 1 --temp 2 --temp 5 --llama-baseline $PWD/buildMaster/bin/llama-server --output qwen14b.jsonl --hf bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_L
./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 1.5B Q4_K_M" --output qwen1.5b.jsonl --hf bartowski/Qwen2.5-1.5B-Instruct-GGUF --ollama qwen2.5:1.5b-instruct-q4_K_M
./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 Coder 7B Q4_K_M" --output qwenc7b.jsonl --hf bartowski/Qwen2.5-Coder-7B-Instruct-GGUF --ollama qwen2.5-coder:7b
@@ -206,7 +205,6 @@ def run(
model: Annotated[Optional[str], typer.Option(help="Name of the model to test (server agnostic)")] = None,
hf: Annotated[Optional[str], typer.Option(help="GGUF huggingface model repo id (+ optional quant) to test w/ llama-server")] = None,
chat_template: Annotated[Optional[str], typer.Option(help="Chat template override for llama-server")] = None,
chat_template_file: Annotated[Optional[str], typer.Option(help="Chat template file override for llama-server")] = None,
ollama: Annotated[Optional[str], typer.Option(help="Ollama model tag to test")] = None,
llama_baseline: Annotated[Optional[str], typer.Option(help="llama-server baseline binary path to use as baseline")] = None,
n: Annotated[int, typer.Option(help="Number of times to run each test")] = 10,
@@ -231,12 +229,6 @@ def run(
# n_ctx = 8192
n_ctx = 2048
if model is None:
if hf is not None:
model = hf.split("/")[-1]
elif ollama is not None:
model = ollama
assert force or append or not output.exists(), f"Output file already exists: {output}; use --force to overwrite"
with output.open('a' if append else 'w') as output_file:
@@ -328,7 +320,6 @@ def run(
server.model_hf_repo = hf
server.model_hf_file = None
server.chat_template = chat_template
server.chat_template_file = chat_template_file
server.server_path = server_path
if port is not None:
server.server_port = port
@@ -344,7 +335,6 @@ def run(
temp=t,
output_kwargs=dict(
chat_template=chat_template,
chat_template_file=chat_template_file,
),
request_kwargs=dict(
ignore_chat_grammar=ignore_chat_grammar,
@@ -365,7 +355,6 @@ def run(
temp=t,
output_kwargs=dict(
chat_template=None,
chat_template_file=None,
),
request_kwargs=dict(
model=ollama,

View File

@@ -1,6 +1,5 @@
#include "llama-batch.h"
#include <cassert>
#include <cstring>
#include <algorithm>
@@ -282,10 +281,9 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
batch = in_batch;
GGML_ASSERT(batch.n_tokens > 0);
if (!batch.pos) {
assert(p0 >= 0);
pos.resize(batch.n_tokens);
for (int32_t i = 0; i < batch.n_tokens; i++) {
pos[i] = p0 + i;
pos[i] = i + p0;
}
batch.pos = pos.data();
}
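// Illustration (not from the original source): a 3-token batch passed with
// batch.pos == NULL and p0 == 7 ends up with positions 7, 8 and 9.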

View File

@@ -25,11 +25,7 @@ llama_context::llama_context(
const auto & hparams = model.hparams;
cparams.n_seq_max = std::max(1u, params.n_seq_max);
if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));
}
cparams.n_seq_max = std::max(1u, params.n_seq_max);
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;
cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -97,7 +93,6 @@ llama_context::llama_context(
}
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
cparams.op_offload = params.op_offload;
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -181,9 +176,8 @@ llama_context::llama_context(
// init the memory module
if (!hparams.vocab_only) {
llama_memory_params params_mem = {
/*.type_k =*/ params.type_k,
/*.type_v =*/ params.type_v,
/*.swa_full =*/ params.swa_full,
/*.type_k =*/ params.type_k,
/*.type_v =*/ params.type_v,
};
memory.reset(model.create_memory(params_mem, cparams));
@@ -738,10 +732,12 @@ int llama_context::encode(llama_batch & inp_batch) {
const auto causal_attn_org = cparams.causal_attn;
// always use non-causal attention for encoder graphs
// TODO: this is a tmp solution until we have a proper way to support enc-dec models
// ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
cparams.causal_attn = false;
if (model.arch == LLM_ARCH_T5) {
// always use non-causal attention for encoder graphs
// TODO: this is a tmp solution until we have a proper way to support enc-dec models
// ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
cparams.causal_attn = false;
}
auto * gf = graph_init();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);
@@ -861,17 +857,11 @@ int llama_context::decode(llama_batch & inp_batch) {
return -1;
}
if (!inp_batch.pos) {
if (inp_batch.seq_id) {
LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
return -1;
}
}
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
// temporary allocate memory for the input batch if needed
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1);
// TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
const llama_batch & batch = batch_allocr.batch;
@@ -959,6 +949,8 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
if (!kv_self->find_slot(ubatch)) {
LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
return 1;
}
@@ -2103,7 +2095,6 @@ llama_context_params llama_context_default_params() {
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
/*.swa_full =*/ true,
};
return result;
@@ -2298,51 +2289,65 @@ int32_t llama_apply_adapter_cvec(
return res ? 0 : -1;
}
//
// kv cache view
//
llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) {
const auto * kv = ctx->get_kv_self();
if (kv == nullptr) {
LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
return {};
}
return llama_kv_cache_view_init(*kv, n_seq_max);
}
void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) {
const auto * kv = ctx->get_kv_self();
if (kv == nullptr) {
LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
return;
}
llama_kv_cache_view_update(view, kv);
}
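// Illustrative use of the debug-only view API wired up above (not part of the
// original source; assumes a valid llama_context * ctx):
//
//     llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/1);
//     llama_kv_cache_view_update(ctx, &view);
//     for (int32_t i = 0; i < view.n_cells; ++i) {
//         if (view.cells[i].pos >= 0) {
//             // cell i is populated at position view.cells[i].pos
//         }
//     }
//     llama_kv_cache_view_free(&view);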
//
// kv cache
//
// deprecated
int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
return llama_kv_self_n_tokens(ctx);
}
int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
return 0;
}
int32_t res = 0;
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);
if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}
return res;
return kv->get_n_tokens();
}
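// Worked example (illustrative, not from the original source): two sequences
// whose cached positions span [0, 9] and [0, 4] contribute (9 - 0) + 1 = 10
// and (4 - 0) + 1 = 5 cells respectively, so the reported count is 15.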
// deprecated
// note: this is the same as above - will be removed anyway, so it's ok
int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) {
return llama_kv_self_used_cells(ctx);
}
int32_t llama_kv_self_used_cells(const llama_context * ctx) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
return 0;
}
int32_t res = 0;
return kv->get_used_cells();
}
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);
if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}
return res;
// deprecated
void llama_kv_cache_clear(llama_context * ctx) {
llama_kv_self_clear(ctx);
}
void llama_kv_self_clear(llama_context * ctx) {
@@ -2354,6 +2359,15 @@ void llama_kv_self_clear(llama_context * ctx) {
kv->clear();
}
// deprecated
bool llama_kv_cache_seq_rm(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1) {
return llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
}
bool llama_kv_self_seq_rm(
llama_context * ctx,
llama_seq_id seq_id,
@@ -2367,6 +2381,16 @@ bool llama_kv_self_seq_rm(
return kv->seq_rm(seq_id, p0, p1);
}
// deprecated
void llama_kv_cache_seq_cp(
llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1) {
llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
}
void llama_kv_self_seq_cp(
llama_context * ctx,
llama_seq_id seq_id_src,
@@ -2381,6 +2405,13 @@ void llama_kv_self_seq_cp(
kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
// deprecated
void llama_kv_cache_seq_keep(
llama_context * ctx,
llama_seq_id seq_id) {
llama_kv_self_seq_keep(ctx, seq_id);
}
void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = ctx->get_kv_self();
if (!kv) {
@@ -2390,6 +2421,16 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
kv->seq_keep(seq_id);
}
// deprecated
void llama_kv_cache_seq_add(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta) {
llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
}
void llama_kv_self_seq_add(
llama_context * ctx,
llama_seq_id seq_id,
@@ -2404,6 +2445,16 @@ void llama_kv_self_seq_add(
kv->seq_add(seq_id, p0, p1, delta);
}
// deprecated
void llama_kv_cache_seq_div(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d) {
llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
}
void llama_kv_self_seq_div(
llama_context * ctx,
llama_seq_id seq_id,
@@ -2418,24 +2469,25 @@ void llama_kv_self_seq_div(
kv->seq_div(seq_id, p0, p1, d);
}
llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
return -1;
}
return kv->seq_pos_min(seq_id);
// deprecated
llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
return llama_kv_self_seq_pos_max(ctx, seq_id);
}
llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
return -1;
return 0;
}
return kv->seq_pos_max(seq_id);
}
// deprecated
void llama_kv_cache_defrag(llama_context * ctx) {
llama_kv_self_defrag(ctx);
}
void llama_kv_self_defrag(llama_context * ctx) {
auto * kv = ctx->get_kv_self();
if (!kv) {
@@ -2446,6 +2498,11 @@ void llama_kv_self_defrag(llama_context * ctx) {
kv->defrag_sched(-1.0f);
}
// deprecated
bool llama_kv_cache_can_shift(const llama_context * ctx) {
return llama_kv_self_can_shift(ctx);
}
bool llama_kv_self_can_shift(const llama_context * ctx) {
const auto * kv = ctx->get_kv_self();
if (!kv) {
@@ -2455,6 +2512,11 @@ bool llama_kv_self_can_shift(const llama_context * ctx) {
return kv->get_can_shift();
}
// deprecated
void llama_kv_cache_update(llama_context * ctx) {
llama_kv_self_update(ctx);
}
// llama state API
// deprecated
@@ -2577,21 +2639,7 @@ int32_t llama_encode(
int32_t llama_decode(
llama_context * ctx,
llama_batch batch) {
int ret = ctx->decode(batch);
// defrag and try again
// TODO: distinguish return code when we are sure that even after defrag there is no space available
if (ret == 1) {
llama_kv_self_defrag(ctx);
ret = ctx->decode(batch);
if (ret == 1) {
LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
return ret;
}
}
const int ret = ctx->decode(batch);
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
}

View File

@@ -1,5 +1 @@
#include "llama-cparams.h"
size_t llama_max_parallel_sequences(void) {
return LLAMA_MAX_PARALLEL_SEQUENCES;
}

View File

@@ -4,8 +4,6 @@
#include <cstdint>
#define LLAMA_MAX_PARALLEL_SEQUENCES 64
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;

View File

@@ -1177,18 +1177,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
for (const auto & trigger_pattern : grammar.trigger_patterns) {
if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
grammar.awaiting_trigger = false;
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
auto constrained_str = grammar.trigger_buffer.substr(start);
// get from the first match to the end of the string
auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
// std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
grammar.trigger_buffer.clear();
llama_grammar_accept_str(grammar, constrained_str);
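To make the capturing-group handling above concrete, here is a hedged, self-contained illustration; the trigger pattern, the buffer contents and the `<tool_call>` marker are hypothetical stand-ins, not taken from the source:

```cpp
#include <regex>
#include <string>

int main() {
    // The first capturing group starts at "<tool_call>", so only that suffix
    // would be handed back to the grammar as the constrained string.
    const std::regex  trigger(R"((?:[\s\S]*?)(<tool_call>[\s\S]*))");
    const std::string buffer = "some free text<tool_call>{\"name\": \"f\"}";
    std::smatch match;
    if (std::regex_match(buffer, match, trigger)) {
        const std::string constrained = buffer.substr(match.position(1));
        // constrained == "<tool_call>{\"name\": \"f\"}"
    }
}
```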

View File

@@ -9,6 +9,33 @@
#include <cmath>
#include <cstring>
static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
// TODO move to hparams if a T5 variant appears that uses a different value
const int64_t max_distance = 128;
if (bidirectional) {
n_buckets >>= 1;
}
const int64_t max_exact = n_buckets >> 1;
int32_t relative_position = x - y;
int32_t relative_bucket = 0;
if (bidirectional) {
relative_bucket += (relative_position > 0) * n_buckets;
relative_position = abs(relative_position);
} else {
relative_position = -std::min<int32_t>(relative_position, 0);
}
int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
return relative_bucket;
}
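// Worked example (illustrative, not part of the original source): with
// n_buckets = 32 and bidirectional = true, the bucket count is halved to 16
// and max_exact = 8. A relative position of +5 lies in the exact region and
// maps to bucket 16 + 5 = 21; distances >= 8 are spread logarithmically up to
// a maximum bucket index of 16 + 15 = 31.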
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -83,7 +110,22 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
if (pos_bucket) {
kv_self->set_input_pos_bucket(pos_bucket, ubatch);
const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
int32_t * data = (int32_t *) pos_bucket->data;
const int64_t n_kv = kv_self->n;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_kv; ++i) {
data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
}
}
}
}
}
@@ -361,18 +403,99 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
}
void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
if (self_kq_mask) {
kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
}
if (self_kq_mask || self_kq_mask_swa) {
const int64_t n_kv = kv_self->n;
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs;
void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
if (self_kq_mask) {
kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
float * data = nullptr;
float * data_swa = nullptr;
if (self_kq_mask_swa) {
kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
if (self_kq_mask) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
data = (float *) self_kq_mask->data;
}
if (self_kq_mask_swa) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
data_swa = (float *) self_kq_mask_swa->data;
}
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
// Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
// Causal mask:
// xxx-------
// xxxx------
// xxxxx-----
// Non-causal mask:
// xxxxx-----
// xxxxx-----
// xxxxx-----
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
for (int h = 0; h < 1; ++h) {
for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];
for (int j = 0; j < n_seq_tokens; ++j) {
const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
for (int i = 0; i < n_kv; ++i) {
float f;
// mask the token if:
if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
|| (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
) {
f = -INFINITY;
} else {
if (hparams.use_alibi) {
f = -std::abs(kv_self->cells[i].pos - pos);
} else {
f = 0.0f;
}
}
if (data) {
data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
}
// may need to cut off old tokens for sliding window
// TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
if (data_swa) {
if (hparams.n_attn_chunk) {
llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
f = -INFINITY;
}
} else {
if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
f = -INFINITY;
}
}
data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
}
}
}
}
// mask padded tokens
if (data) {
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int j = 0; j < n_kv; ++j) {
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
}
}
}
// mask padded tokens
if (data_swa) {
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int j = 0; j < n_kv; ++j) {
data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
}
}
}
}
}
}
@@ -422,6 +545,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
n_layer (hparams.n_layer),
n_rot (hparams.n_rot),
n_ctx (cparams.n_ctx),
n_ctx_per_seq (cparams.n_ctx / cparams.n_seq_max),
n_head (hparams.n_head()),
n_head_kv (hparams.n_head_kv()),
n_embd_head_k (hparams.n_embd_head_k),
@@ -1029,7 +1153,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
const auto n_kv = kv_self->get_n();
const auto n_kv = kv_self->n;
auto & cur = inp->pos_bucket;
@@ -1064,12 +1188,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
ggml_tensor * v_mla,
bool v_trans,
float kq_scale) const {
const bool v_trans = v->nb[1] > v->nb[2];
//const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
//const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
q = ggml_permute(ctx0, q, 0, 2, 1, 3);
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
v = ggml_permute(ctx0, v, 0, 2, 1, 3);
//const int64_t n_head = hparams.n_head(il);
//const int64_t n_head_kv = hparams.n_head_kv(il);
//const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
const auto n_tokens = q->ne[1];
const auto n_head = q->ne[2];
@@ -1208,11 +1336,17 @@ ggml_tensor * llm_graph_context::build_attn(
const auto & kq_mask = inp->get_kq_mask();
ggml_tensor * q = q_cur;
ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur;
ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
//cb(q, "q", il);
ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
//cb(k, "k", il);
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
cb(cur, "kqv_out", il);
if (wo) {
@@ -1235,16 +1369,22 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
{
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
const auto n_kv = kv_self->n;
const auto n_kv = kv_self->get_n();
inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
//cb(inp->self_kq_mask, "KQ_mask", -1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
//cb(inp->self_kq_mask, "KQ_mask", -1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
if (hparams.n_swa_pattern > 1) {
GGML_ASSERT(hparams.n_swa > 0);
inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
//cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
}
return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
@@ -1269,104 +1409,81 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_build_forward_expand(gf, v_cur);
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
const auto & n_ctx = cparams.n_ctx;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
const auto n_tokens = q_cur->ne[2];
const bool v_trans = !cparams.flash_attn;
// store to KV cache
{
ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
}
const auto kv_head = kv_self->head;
const auto & kq_mask = inp->get_kq_mask();
GGML_ASSERT(kv_self->size == n_ctx);
ggml_tensor * q = q_cur;
ggml_tensor * k = kv_self->get_k(ctx0, il);
ggml_tensor * v = kv_self->get_v(ctx0, il);
ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
//cb(k_cache_view, "k_cache_view", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
cb(cur, "kqv_out", il);
// note: storing RoPE-ed version of K in the KV cache
ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4) {
// GLM4 seems to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens);
ggml_tensor * v_cache_view = nullptr;
if (!v_trans) {
v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
} else {
// note: the V cache is transposed when not using flash attention
v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
( n_ctx)*ggml_element_size(kv_self->v_l[il]),
(kv_head)*ggml_element_size(kv_self->v_l[il]));
v_cur = ggml_transpose(ctx0, v_cur);
}
//cb(v_cache_view, "v_cache_view", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
}
if (wo_b) {
cur = ggml_add(ctx0, cur, wo_b);
}
return cur;
}
llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
{
const auto n_kv = kv_self->get_kv_base()->get_n();
inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
//cb(inp->self_kq_mask, "KQ_mask", -1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
{
GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
const auto n_kv = kv_self->get_kv_swa()->get_n();
inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
//cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
}
return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_unified_iswa * inp,
ggml_cgraph * gf,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(gf, q_cur);
ggml_build_forward_expand(gf, k_cur);
ggml_build_forward_expand(gf, v_cur);
const bool is_swa = hparams.is_swa(il);
const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
// store to KV cache
{
ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
}
const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
ggml_tensor * q = q_cur;
ggml_tensor * k = kv->get_k(ctx0, il);
ggml_tensor * v = kv->get_v(ctx0, il);
const auto n_kv = kv_self->n;
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
const int64_t n_head_kv = hparams.n_head_kv(il);
const auto & n_embd_head_k = hparams.n_embd_head_k;
const auto & n_embd_head_v = hparams.n_embd_head_v;
ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
//cb(q, "q", il);
ggml_tensor * k =
ggml_view_3d(ctx0, kv_self->k_l[il],
n_embd_head_k, n_kv, n_head_kv,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
0);
//cb(k, "k", il);
ggml_tensor * v = !v_trans ?
ggml_view_3d(ctx0, kv_self->v_l[il],
n_embd_head_v, n_kv, n_head_kv,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
0) :
ggml_view_3d(ctx0, kv_self->v_l[il],
n_kv, n_embd_head_v, n_head_kv,
ggml_element_size(kv_self->v_l[il])*n_ctx,
ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
0);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
cb(cur, "kqv_out", il);
if (wo) {
@@ -1417,11 +1534,17 @@ ggml_tensor * llm_graph_context::build_attn(
const auto & kq_mask = inp->get_kq_mask_cross();
ggml_tensor * q = q_cur;
ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur;
ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
//cb(q, "q", il);
ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
//cb(k, "k", il);
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
cb(cur, "kqv_out", il);
if (wo) {
@@ -1589,30 +1712,3 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur);
}
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
// TODO move to hparams if a T5 variant appears that uses a different value
const int64_t max_distance = 128;
if (bidirectional) {
n_buckets >>= 1;
}
const int64_t max_exact = n_buckets >> 1;
int32_t relative_position = x - y;
int32_t relative_bucket = 0;
if (bidirectional) {
relative_bucket += (relative_position > 0) * n_buckets;
relative_position = abs(relative_position);
} else {
relative_position = -std::min<int32_t>(relative_position, 0);
}
int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
return relative_bucket;
}
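As a standalone illustration of the bucketing above (not part of the diff): with 32 buckets in the bidirectional case, offsets below max_exact each get their own bucket, while larger offsets are merged logarithmically up to max_distance. The sketch below copies the formula verbatim into a local helper, rel_bucket, and prints the bucket for a few distances.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// standalone copy of the bucketing formula above, for illustration only
static int32_t rel_bucket(int32_t x, int32_t y, uint32_t n_buckets, bool bidirectional) {
    const int64_t max_distance = 128;

    if (bidirectional) {
        n_buckets >>= 1;
    }

    const int64_t max_exact = n_buckets >> 1;

    int32_t relative_position = x - y;
    int32_t relative_bucket = 0;

    if (bidirectional) {
        relative_bucket += (relative_position > 0) * n_buckets;
        relative_position = std::abs(relative_position);
    } else {
        relative_position = -std::min<int32_t>(relative_position, 0);
    }

    int32_t if_large = (int32_t) floorf(max_exact + logf(1.0f * relative_position / max_exact) * (n_buckets - max_exact) / logf(1.0f * max_distance / max_exact));
    if_large = std::min<int32_t>(if_large, n_buckets - 1);

    return relative_bucket + (relative_position < max_exact ? relative_position : if_large);
}

int main() {
    // distances 1, 8, 64 and 512, encoder-style (bidirectional) with 32 buckets
    for (int d : {1, 8, 64, 512}) {
        printf("distance %3d -> bucket %d\n", d, rel_bucket(0, d, 32, true));
    }
    return 0;
}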

View File

@@ -19,7 +19,6 @@ struct llama_cparams;
class llama_memory_i;
class llama_kv_cache_unified;
class llama_kv_cache_unified_iswa;
class llama_kv_cache_recurrent;
// certain models (typically multi-modal) can produce different types of graphs
@@ -256,31 +255,6 @@ public:
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch]
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_kv_cache_unified * kv_self;
};
class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
public:
llm_graph_input_attn_kv_unified_iswa(
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_unified_iswa * kv_self) :
hparams(hparams),
cparams(cparams),
kv_self(kv_self) {
}
~llm_graph_input_attn_kv_unified_iswa() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
@@ -292,7 +266,7 @@ public:
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_kv_cache_unified_iswa * kv_self;
const llama_kv_cache_unified * kv_self;
};
class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -404,6 +378,7 @@ struct llm_graph_context {
const int64_t n_layer;
const int64_t n_rot;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_ctx_per_seq;
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head_k;
@@ -532,12 +507,13 @@ struct llm_graph_context {
ggml_tensor * build_attn_mha(
ggml_cgraph * gf,
ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
bool v_trans,
float kq_scale) const;
llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@@ -570,21 +546,6 @@ struct llm_graph_context {
float kq_scale,
int il) const;
llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
ggml_tensor * build_attn(
llm_graph_input_attn_kv_unified_iswa * inp,
ggml_cgraph * gf,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
llm_graph_input_attn_cross * build_attn_inp_cross() const;
ggml_tensor * build_attn(
@@ -635,6 +596,3 @@ struct llm_graph_context {
ggml_tensor * cls_out,
ggml_tensor * cls_out_b) const;
};
// TODO: better name
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);

View File

@@ -2,22 +2,6 @@
#include "ggml.h"
void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
for (uint32_t il = 0; il < n_layer; ++il) {
swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}
}
bool llama_hparams::is_swa_any() const {
for (uint32_t il = 0; il < n_layer; ++il) {
if (swa_layers[il]) {
return true;
}
}
return false;
}
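A minimal standalone sketch (not part of the diff) of the layer pattern that set_swa_pattern produces, assuming n_layer = 6 and n_pattern = 3; it reproduces the "swa, swa, dense, ..." example from the comment further down in llama-hparams.h.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint32_t n_layer   = 6;
    const uint32_t n_pattern = 3;

    std::vector<bool> swa_layers(n_layer);
    for (uint32_t il = 0; il < n_layer; ++il) {
        // same rule as llama_hparams::set_swa_pattern above
        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
    }

    for (uint32_t il = 0; il < n_layer; ++il) {
        printf("il == %u: %s\n", il, swa_layers[il] ? "swa" : "dense");
    }
    // prints: swa, swa, dense, swa, swa, dense
    return 0;
}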
uint32_t llama_hparams::n_head(uint32_t il) const {
if (il < n_layer) {
return n_head_arr[il];
@@ -88,7 +72,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
bool llama_hparams::is_swa(uint32_t il) const {
if (il < n_layer) {
return swa_layers[il];
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
}
GGML_ABORT("fatal error");

View File

@@ -14,12 +14,6 @@ enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
};
enum llama_swa_type {
LLAMA_SWA_TYPE_NONE = 0,
LLAMA_SWA_TYPE_STANDARD = 1,
LLAMA_SWA_TYPE_CHUNKED = 2,
};
struct llama_hparams_posnet {
uint32_t n_embd;
uint32_t n_layer;
@@ -41,6 +35,8 @@ struct llama_hparams {
uint32_t n_embd_features = 0;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_swa = 0; // sliding window attention (SWA)
uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
@@ -100,15 +96,6 @@ struct llama_hparams {
std::array<int, 4> rope_sections;
// Sliding Window Attention (SWA)
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0;
// if swa_layers[il] == true, then layer il is SWA
// if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense
std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
// for State Space Models
uint32_t ssm_d_conv = 0;
uint32_t ssm_d_inner = 0;
@@ -129,10 +116,11 @@ struct llama_hparams {
bool causal_attn = true;
bool use_alibi = false;
bool attn_soft_cap = false;
bool use_kq_norm = true;
// llama4
uint32_t n_moe_layer_step = 0;
bool use_kq_norm = true;
uint32_t n_attn_chunk = 0;
// values below seems to be fixed on llama4
uint32_t n_no_rope_layer_step = 4;
uint32_t n_attn_temp_floor_scale = 8192;
float f_attn_temp_scale = 0.1;
@@ -145,23 +133,6 @@ struct llama_hparams {
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
// note that if n_pattern == 0, all layers are SWA
// if n_pattern == 1, all layers are dense
// example: n_pattern = 3
// il == 0: swa
// il == 1: swa
// il == 2: dense
// il == 3: swa
// il == 4: swa
// il == 5: dense
// il == 6: swa
// etc ...
void set_swa_pattern(uint32_t n_pattern);
// return true if one of the layers is SWA
bool is_swa_any() const;
uint32_t n_head(uint32_t il = 0) const;
uint32_t n_head_kv(uint32_t il = 0) const;

File diff suppressed because it is too large

View File

@@ -4,12 +4,10 @@
#include "llama-io.h"
#include "llama-graph.h"
#include "llama-memory.h"
#include "llama-kv-cells.h"
#include "ggml-cpp.h"
#include <set>
#include <unordered_map>
#include <vector>
struct llama_cparams;
@@ -36,16 +34,12 @@ struct llama_kv_cache : public llama_memory_i {
virtual void defrag_sched(float thold) = 0;
// simulate full cache, used for allocating worst-case compute buffers
// TODO: remove
virtual void set_full() = 0;
//
// batch processing
//
// =============================================================================================================
// TODO: refactor and simplify this [TAG: KV_API]
virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
// different KV caches require different batch splitting strategies
@@ -54,10 +48,11 @@ struct llama_kv_cache : public llama_memory_i {
// find an empty slot of size "n_tokens" in the cache
virtual bool find_slot(const llama_ubatch & batch) = 0;
// =============================================================================================================
// getters
virtual bool get_can_shift() const = 0;
virtual int32_t get_n_tokens() const = 0;
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
virtual llama_pos get_pos_max() const = 0;
virtual bool get_can_shift() const = 0;
bool get_can_edit() const override { return get_can_shift(); }
@@ -92,25 +87,38 @@ private:
// llama_kv_cache_unified
//
// TODO: add notion of max sequences
class llama_kv_cache_unified : public llama_kv_cache {
public:
struct kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const kv_cell & other) const {
return seq_id == other.seq_id;
}
};
static uint32_t get_padding(const llama_cparams & cparams);
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_kv_cache_unified(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type);
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
uint32_t kv_size,
uint32_t padding);
~llama_kv_cache_unified() = default;
@@ -122,11 +130,10 @@ public:
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
//
@@ -143,6 +150,7 @@ public:
void set_full() override;
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
// updates the cache head
@@ -150,94 +158,50 @@ public:
// to the first cell of the slot.
bool find_slot(const llama_ubatch & batch) override;
int32_t get_n_tokens() const override;
int32_t get_used_cells() const override;
// TODO: better data structures to reduce the cost of this operation
llama_pos get_pos_max() const override;
bool get_can_shift() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
//
// llama_kv_cache_unified specific API
//
uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
uint32_t size = 0; // total number of cells, shared across all sequences
uint32_t used = 0; // used cells (i.e. at least one seq_id)
uint32_t get_n() const;
uint32_t get_size() const;
// computed before each graph build
uint32_t n = 0;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
std::vector<kv_cell> cells;
// store k_cur and v_cur in the cache based on the current head location
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
void prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax);
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
void set_input_k_shift (ggml_tensor * dst) const;
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
std::vector<ggml_tensor *> k_l; // per layer
std::vector<ggml_tensor *> v_l;
private:
const llama_model & model;
const llama_hparams & hparams;
struct kv_layer {
// layer index in the model
// note: can be different from the layer index in the KV cache
uint32_t il;
ggml_tensor * k;
ggml_tensor * v;
};
bool has_shift = false;
bool do_defrag = false;
bool v_trans = true; // the value tensor is transposed
uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
// computed before each graph build
// TODO: cells should start to maintain this value dynamically based on the edits
uint32_t n = 0;
const uint32_t n_seq_max = 1;
bool can_shift = false;
// required padding
const uint32_t n_pad = 1;
uint32_t padding = 1;
// SWA
const uint32_t n_swa = 0;
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
ggml_type type_k = GGML_TYPE_F16;
ggml_type type_v = GGML_TYPE_F16;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
llama_kv_cells_unified cells;
std::vector<kv_layer> layers;
// model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids;
// recovery information used to restore the KV cells to their original state in case of a failure
// TODO: do not store as a state in the llama_kv_cache object, instead return upon batch preparation
// to achieve that, first need to refactor the llama_kv_cache interface [TAG: KV_API]
struct {
void clear() {
states.clear();
}
struct state {
uint32_t i;
llama_kv_cells_unified cells;
};
// stack with the partial states before each ubatch
std::vector<state> states;
} recovery;
// defrag
struct {
std::vector<uint32_t> ids;
@@ -246,8 +210,18 @@ private:
// return true if cells have been moved
bool defrag_prepare(int32_t n_max_nodes);
// commit/restore cache
struct slot_range {
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
uint32_t c1 = 0;
};
// pending cell updates that are not yet committed
struct {
std::vector<slot_range> ranges;
} pending;
// find how many cells are currently in use
// TODO: optimize
uint32_t cell_max() const;
size_t total_size() const;
@@ -255,8 +229,6 @@ private:
size_t size_k_bytes() const;
size_t size_v_bytes() const;
bool is_masked_swa(llama_pos p0, llama_pos p1) const;
ggml_tensor * build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
@@ -283,100 +255,6 @@ private:
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};
//
// llama_kv_cache_unified_iswa
//
// utilizes two instances of llama_kv_cache_unified
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
// upon successful commit, the SWA cache removes old tokens outside the n_swa window
class llama_kv_cache_unified_iswa : public llama_kv_cache {
public:
llama_kv_cache_unified_iswa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool swa_full,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_batch,
uint32_t n_pad);
~llama_kv_cache_unified_iswa() = default;
//
// llama_memory_i
//
void clear() override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
//
// llama_kv_cache
//
void restore() override;
void commit() override;
bool update(llama_context & ctx) override;
void defrag_sched(float thold) override;
void set_full() override;
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
bool find_slot(const llama_ubatch & batch) override;
bool get_can_shift() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
//
// llama_kv_cache_unified_iswa specific API
//
llama_kv_cache_unified * get_kv_base() const;
llama_kv_cache_unified * get_kv_swa () const;
private:
const llama_hparams & hparams;
bool do_prune = true;
struct {
struct entry {
llama_pos pmin;
llama_pos pmax;
};
void clear() {
pos.clear();
}
// used to perform SWA pruning of old tokens
std::unordered_map<llama_seq_id, entry> pos;
} pending;
std::unique_ptr<llama_kv_cache_unified> kv_base;
std::unique_ptr<llama_kv_cache_unified> kv_swa;
};
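A rough standalone sketch (plain ints, a single sequence, and an n_swa of 4 assumed purely for illustration) of the pruning idea described in the comment above: after a commit, the SWA half of the cache keeps only positions that still fall inside the sliding window and drops the rest.

#include <cstdio>
#include <vector>

int main() {
    const int n_swa = 4;                             // assumed window size, for illustration
    std::vector<int> swa_cells = {0, 1, 2, 3, 4, 5, 6, 7};
    const int pmax = 7;                              // highest committed position in the sequence

    // keep only positions inside the window (pmax - n_swa, pmax]
    std::vector<int> kept;
    for (int p : swa_cells) {
        if (p > pmax - n_swa) {
            kept.push_back(p);
        }
    }

    for (int p : kept) {
        printf("kept pos %d\n", p);                  // prints 4 5 6 7
    }
    return 0;
}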
//
// llama_kv_cache_recurrent
//
@@ -408,8 +286,7 @@ public:
ggml_type type_k,
ggml_type type_v,
bool offload,
uint32_t kv_size,
uint32_t n_seq_max);
uint32_t kv_size);
~llama_kv_cache_recurrent() = default;
@@ -421,11 +298,10 @@ public:
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
//
@@ -435,17 +311,24 @@ public:
void restore() override;
void commit() override;
bool update(llama_context & ctx) override;
bool update(llama_context & lctx) override;
void defrag_sched(float thold) override;
void set_full() override;
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
bool find_slot(const llama_ubatch & batch) override;
int32_t get_n_tokens() const override;
int32_t get_used_cells() const override;
// TODO: better data structures to reduce the cost of this operation
llama_pos get_pos_max() const override;
bool get_can_shift() const override;
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
@@ -485,7 +368,8 @@ private:
std::vector<slot_range> ranges;
} pending;
const uint32_t n_seq_max = 1;
ggml_type type_k = GGML_TYPE_F16;
ggml_type type_v = GGML_TYPE_F16;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
@@ -504,3 +388,12 @@ private:
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};
//
// kv cache view
//
llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);

View File

@@ -1,273 +0,0 @@
#pragma once
#include "llama.h"
#include "llama-cparams.h"
#include <bitset>
#include <cassert>
#include <vector>
// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
class llama_kv_cells_unified {
public:
void reset() {
for (uint32_t i = 0; i < pos.size(); ++i) {
pos[i] = -1;
shift[i] = 0;
seq[i].reset();
}
used = 0;
has_shift = false;
}
void reset_shift() {
has_shift = false;
for (uint32_t i = 0; i < shift.size(); ++i) {
shift[i] = 0;
}
}
uint32_t size() const {
return pos.size();
}
void resize(uint32_t n) {
pos.resize(n);
shift.resize(n);
seq.resize(n);
reset();
}
bool is_empty(uint32_t i) const {
assert(i < pos.size());
assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
return pos[i] == -1;
}
uint32_t get_used() const {
return used;
}
bool get_has_shift() const {
return has_shift;
}
// move cell isrc to idst (used during defrag)
void mv(uint32_t isrc, uint32_t idst) {
assert(isrc < pos.size());
assert(idst < pos.size());
pos [idst] = pos [isrc];
shift[idst] = shift[isrc];
seq [idst] = seq [isrc];
pos [isrc] = -1;
shift[isrc] = 0;
seq [isrc].reset();
}
// copy the state of cells [i, i + n) (used for save/restore the state of the cells)
llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
assert(i + n <= pos.size());
llama_kv_cells_unified res;
res.resize(n);
for (uint32_t j = 0; j < n; ++j) {
res.pos[j] = pos[i + j];
res.seq[j] = seq[i + j];
assert(shift[i + j] == 0);
}
return res;
}
// set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
void set(uint32_t i, const llama_kv_cells_unified & other) {
assert(i + other.pos.size() <= pos.size());
for (uint32_t j = 0; j < other.pos.size(); ++j) {
if (pos[i + j] == -1 && other.pos[j] != -1) {
used++;
}
if (pos[i + j] != -1 && other.pos[j] == -1) {
used--;
}
pos[i + j] = other.pos[j];
seq[i + j] = other.seq[j];
assert(shift[i + j] == 0);
}
}
// note: call only if the cell has seq_id
// return true if the cell becomes empty
bool seq_rm(uint32_t i, llama_seq_id seq_id) {
assert(i < pos.size());
assert(seq[i].test(seq_id));
assert(pos[i] != -1);
assert(seq_id >= 0);
seq[i].reset(seq_id);
if (seq[i].none()) {
pos[i] = -1;
used--;
return true;
}
return false;
}
// return true if the cell becomes empty (i.e. it did not contain seq_id before the call)
bool seq_keep(uint32_t i, llama_seq_id seq_id) {
assert(i < pos.size());
if (seq[i].test(seq_id)) {
seq[i].reset();
seq[i].set(seq_id);
return false;
}
if (seq[i].any()) {
seq[i].reset();
pos[i] = -1;
used--;
return true;
}
assert(pos[i] == -1);
return false;
}
bool seq_has(uint32_t i, llama_seq_id seq_id) const {
assert(i < pos.size());
assert(seq_id >= 0);
return seq[i].test(seq_id);
}
// note: call only if the cell is not empty and the seq_id is not in the cell
void seq_add(uint32_t i, llama_seq_id seq_id) {
assert(i < pos.size());
assert(pos[i] != -1);
assert(!seq[i].test(seq_id));
seq[i].set(seq_id);
}
// note: call only if the cell is not empty
llama_pos pos_get(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);
return pos[i];
}
// note: call only if the cell is not empty
llama_pos get_shift(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);
return shift[i];
}
// check if a cell is not empty and its position is within [p0, p1)
bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
assert(i < pos.size());
return pos[i] >= p0 && pos[i] < p1;
}
// set the position of an empty cell
// does not modify "has_shift"
// note: call only if the cell is empty
void pos_set(uint32_t i, llama_pos p) {
assert(i < pos.size());
assert(pos[i] == -1);
pos[i] = p;
used++;
}
// pos[i] = pos[i] + d
// sets "has_shift" to true
// note: call only if the cell is not empty
bool pos_add(uint32_t i, llama_pos d) {
assert(i < pos.size());
assert(pos[i] != -1);
pos[i] += d;
shift[i] += d;
has_shift = true;
if (pos[i] < 0) {
pos[i] = -1;
seq[i].reset();
used--;
return true;
}
return false;
}
// pos[i] = pos[i] / d
// sets "has_shift" to true
// note: call only if the cell is not empty
void pos_div(uint32_t i, int d) {
assert(i < pos.size());
assert(pos[i] != -1);
const llama_pos p_old = pos[i];
pos[i] /= d;
shift[i] += p_old - pos[i];
has_shift = true;
}
private:
uint32_t used = 0; // used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
bool has_shift = false;
std::vector<llama_pos> pos;
// this array accumulates any applied shifts to the pos array since the last reset_shift() call
// this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
//
// cells.pos_add(x, shift_x);
// cells.pos_div(y, shift_y);
// ...
//
// if (cells.has_shift()) {
// for (int i = 0; i < n; ++i) {
// auto shift_i = cells.get_shift(i);
// ...
// }
// cells.reset_shift();
// }
//
std::vector<llama_pos> shift;
std::vector<std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>> seq;
};
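A small self-contained sketch of the deferred-shift pattern shown in the comment above, with plain ints standing in for the class internals (not part of the diff): shifts are accumulated per cell and later consumed in a single pass before being reset.

#include <cstdio>
#include <vector>

int main() {
    // simplified stand-ins for cells.pos and cells.shift
    std::vector<int> pos   = {0, 1, 2, 3};
    std::vector<int> shift = {0, 0, 0, 0};
    bool has_shift = false;

    // queue a few position edits (mirrors pos_add in llama_kv_cells_unified)
    auto pos_add = [&](int i, int d) {
        pos[i]   += d;
        shift[i] += d;
        has_shift = true;
    };
    pos_add(2, 5);
    pos_add(3, 5);

    // later, apply all accumulated shifts in one go and reset (mirrors get_shift/reset_shift)
    if (has_shift) {
        for (size_t i = 0; i < pos.size(); ++i) {
            printf("cell %zu: pos = %d, accumulated shift = %d\n", i, pos[i], shift[i]);
            shift[i] = 0;
        }
        has_shift = false;
    }
    return 0;
}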

Some files were not shown because too many files have changed in this diff