arg: Add LLAMA_ARG_API_KEY_FILE environment variable for --api-key-file (#23167 )

test-llama-archs: fix table format [no release] (#23810 )
ggml: auto apply iGPU flag CUDA/HIP if integrated device (#23007 )
2026-05-28 17:27:26 +03:00 · 2026-05-28 16:25:40 +02:00 · 2026-05-28 15:53:54 +02:00 · 2026-05-28 15:01:14 +02:00 · 2026-05-28 14:51:14 +02:00 · 2026-05-28 14:50:25 +02:00
148 changed files with 10290 additions and 3945 deletions
--- a/.devops/zendnn.Dockerfile
+++ b/.devops/zendnn.Dockerfile
@@ -0,0 +1,101 @@
+ARG UBUNTU_VERSION=24.04
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
+
+ENV CC=gcc-13 CXX=g++-13
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
+    cmake --build build -j $(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r conversion /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 libnuma1 curl \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -96,3 +96,34 @@ runs:
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+    - name: Install Cuda Toolkit 13.3
+      if: ${{ inputs.cuda_version == '13.3' }}
+      shell: pwsh
+      run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.3.29-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.5.1.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.3.29-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.3.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.3.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cccl/windows-x86_64/cccl-windows-x86_64-13.3.3.3.1-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_crt-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_cudart-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvcc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvrtc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libcublas-windows-x86_64-13.5.1.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libnvvm-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvtx-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_profiler_api-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\visual_studio_integration-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cccl-windows-x86_64-13.3.3.3.1-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V13_3=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/workflows/build-3rd-party.yml
+++ b/.github/workflows/build-3rd-party.yml
@@ -22,9 +22,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  ubuntu-24-llguidance:
--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -31,7 +31,7 @@ jobs:
  android-ndk-snapdragon:
    runs-on: ubuntu-latest
    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
+      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
    defaults:
      run:
        shell: bash
@@ -61,7 +61,7 @@ jobs:
  linux-iot-snapdragon:
    runs-on: ubuntu-latest
    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
+      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
    defaults:
      run:
        shell: bash
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -27,12 +27,12 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  android:
+  default:
    runs-on: ubuntu-latest

    steps:
@@ -58,7 +58,7 @@ jobs:
          cd examples/llama.android
          ./gradlew build --no-daemon

-  android-ndk:
+  ndk:
    runs-on: ubuntu-latest
    container:
      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
@@ -91,3 +91,59 @@ jobs:
        with:
          name: llama-cpp-android-arm64-cpu
          path: pkg-adb/llama.cpp
+
+  arm64:
+    runs-on: ubuntu-latest
+
+    env:
+      NDK_VERSION: "29.0.14206865"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
+      #        for some reason, the ccache does not improve the build time in this case
+      # example:
+      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
+      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
+      #
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: android-ubuntu-arm64
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Set up JDK
+        uses: actions/setup-java@v5
+        with:
+          java-version: 17
+          distribution: temurin
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - name: Install NDK
+        run: |
+          sdkmanager "ndk;${{ env.NDK_VERSION }}"
+          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
+            -DANDROID_ABI=arm64-v8a \
+            -DANDROID_PLATFORM=android-28 \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@@ -32,12 +32,12 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  macOS-latest-ios:
+  macos-latest-arm64:
    runs-on: macos-latest

    steps:
@@ -48,7 +48,80 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-ios
+          key: apple-arm64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=OFF \
+            -DGGML_METAL_SHADER_DEBUG=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main -E "test-llama-archs" --verbose --timeout 900
+
+  macos-latest-x64:
+    runs-on: macos-15-intel
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: apple-x64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  macos-latest-ios:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      # TODO: this likely does not do anything - if yes, remove it
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: apple-ios
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -117,7 +190,7 @@ jobs:
          xcodebuild -downloadPlatform iOS
          xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build

-  macOS-latest-tvos:
+  macos-latest-tvos:
    runs-on: macos-latest

    steps:
@@ -125,10 +198,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      # TODO: this likely does not do anything - if yes, remove it
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-tvos
+          key: apple-tvos
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -150,7 +224,7 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

-  macOS-latest-visionos:
+  macos-latest-visionos:
    runs-on: macos-latest

    steps:
@@ -158,6 +232,14 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      # TODO: this likely does not do anything - if yes, remove it
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: apple-visionos
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
      - name: Build
        id: cmake_build
        run: |
@@ -176,7 +258,7 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

-  macOS-latest-swift:
+  macos-latest-swift:
    runs-on: macos-latest
    needs: macos-latest-ios-xcode

@@ -189,10 +271,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      # TODO: this likely does not do anything - if yes, remove it
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-swift
+          key: apple-swift
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -28,7 +28,7 @@ jobs:
        id: cache-sdk
        with:
          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}

      - name: Setup Vulkan SDK
        if: steps.cache-sdk.outputs.cache-hit != 'true'
@@ -54,7 +54,7 @@ jobs:
  #      id: cache-toolchain
  #      with:
  #        path: ./spacemit_toolchain
-  #        key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+  #        key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}

  #    - name: Setup SpacemiT Toolchain
  #      if: steps.cache-toolchain.outputs.cache-hit != 'true'
@@ -81,7 +81,7 @@ jobs:
        id: cache-openvino
        with:
          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
@@ -108,7 +108,7 @@ jobs:
        id: cache-rocm
        with:
          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}

      - name: Setup ROCm
        if: steps.cache-rocm.outputs.cache-hit != 'true'
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@@ -29,74 +29,76 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  openEuler-latest-cann:
-    defaults:
-      run:
-        shell: bash -el {0}
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-        use_acl_graph: ['on', 'off']
-        exclude:
-          # 310P does not support USE_ACL_GRAPH=on
-          - chip_type: '310p'
-            use_acl_graph: 'on'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE} \
-                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  openEuler-latest-cann:
+#    defaults:
+#      run:
+#        shell: bash -el {0}
+#    strategy:
+#      matrix:
+#        arch: [x86, aarch64]
+#        chip_type: ['910b', '310p']
+#        build: ['Release']
+#        use_acl_graph: ['on', 'off']
+#        exclude:
+#          # 310P does not support USE_ACL_GRAPH=on
+#          - chip_type: '310p'
+#            use_acl_graph: 'on'
+#    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+#    steps:
+#      - name: Checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Free up disk space
+#        uses: ggml-org/free-disk-space@v1.3.1
+#        with:
+#          tool-cache: true
+#
+#      - name: Set container image
+#        id: cann-image
+#        run: |
+#          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
+#          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+#
+#      - name: Pull container image
+#        run: docker pull "${{ steps.cann-image.outputs.image }}"
+#
+#      - name: Build
+#        env:
+#          BUILD_TYPE: ${{ matrix.build }}
+#          SOC_TYPE: ascend${{ matrix.chip_type }}
+#          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
+#        run: |
+#          HOST_UID=$(id -u)
+#          HOST_GID=$(id -g)
+#
+#          docker run --rm \
+#            -v "${PWD}:/workspace" \
+#            -w /workspace \
+#            -e SOC_TYPE=${SOC_TYPE} \
+#            -e BUILD_TYPE=${BUILD_TYPE} \
+#            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
+#            "${{ steps.cann-image.outputs.image }}" \
+#            bash -lc '
+#              set -e
+#              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
+#              yum clean all && rm -rf /var/cache/yum
+#              git config --global --add safe.directory "/workspace"
+#              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+#              cmake -S . -B build \
+#                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+#                  -DGGML_CANN=on \
+#                  -DSOC_TYPE=${SOC_TYPE} \
+#                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
+#              cmake --build build -j $(nproc)
+#
+#              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+#            '
--- a/.github/workflows/build-cpu.yml
+++ b/.github/workflows/build-cpu.yml
@@ -0,0 +1,231 @@
+name: CI (cpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-cpu.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp',
+      '**/*.glsl',
+      '**/*.wgsl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-cpu.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp',
+      '**/*.glsl',
+      '**/*.wgsl'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  build-cmake-pkg:
+    uses: ./.github/workflows/build-cmake-pkg.yml
+
+  ubuntu:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cpu-${{ matrix.os }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  windows:
+    runs-on: windows-2025
+
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      SDE_VERSION: 9.33.0-2024-01-07
+      VULKAN_VERSION: 1.4.313.2
+
+    strategy:
+      matrix:
+        include:
+          - build: 'x64-cpu-static'
+            arch: 'x64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
+          - build: 'x64-openblas'
+            arch: 'x64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+          - build: 'x64-vulkan'
+            arch: 'x64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+          - build: 'arm64'
+            arch: 'arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cpu-windows-2025-${{ matrix.build }}
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Download OpenBLAS
+        id: get_openblas
+        if: ${{ matrix.build == 'x64-openblas' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
+          mkdir $env:RUNNER_TEMP/openblas
+          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
+          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
+
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'x64-vulkan' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -S . -B build ${{ matrix.defines }} `
+            -DLLAMA_BUILD_BORINGSSL=ON
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Add libopenblas.dll
+        id: add_libopenblas_dll
+        if: ${{ matrix.build == 'x64-openblas' }}
+        run: |
+          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
+          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
+
+      - name: Test
+        id: cmake_test
+        if: ${{ matrix.arch == 'x64' }}
+        run: |
+          cd build
+          ctest -L main -C Release --verbose --timeout 900
+
+      # TODO: disabled for now, consider adding tests for all CPU variants instead
+      # - name: Test (Intel SDE)
+      #   id: cmake_test_sde
+      #   if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+      #   run: |
+      #     curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
+      #     # for some weird reason windows tar doesn't like sde tar.xz
+      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
+      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
+      #     $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
+      #     cd build
+      #     $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
+      #     & $sde -future -- ctest -L main -C Release --verbose --timeout 900
--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -287,7 +287,7 @@ jobs:
      #  id: cache-toolchain
      #  with:
      #    path: ./spacemit_toolchain
-      #    key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+      #    key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}

      - name: Setup SpacemiT Toolchain
        #if: steps.cache-toolchain.outputs.cache-hit != 'true'
--- a/.github/workflows/build-cuda-ubuntu.yml
+++ b/.github/workflows/build-cuda-ubuntu.yml
@@ -0,0 +1,134 @@
+name: CI (CUDA, ubuntu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-cuda-ubuntu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-cuda-ubuntu.yml',
+      'ggml/src/ggml-cuda/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  cuda:
+    runs-on: ubuntu-24.04
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Install dependencies
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt update
+          apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cuda-ubuntu-24.04-cuda
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with CMake
+        # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
+        run: |
+          cmake -S . -B build -G Ninja \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_CUDA_ARCHITECTURES=89-real \
+            -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CUDA=ON \
+            -DGGML_CUDA_CUB_3DOT2=ON
+          cmake --build build
+
+  hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.1.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cuda-ubuntu-22.04-hip
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake HIP support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGPU_TARGETS="gfx1030" \
+            -DGGML_HIP=ON
+          cmake --build build --config Release -j $(nproc)
+
+  musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libssl-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cuda-ubuntu-22.04-musa
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
+          time cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-cuda-windows.yml
+++ b/.github/workflows/build-cuda-windows.yml
@@ -0,0 +1,146 @@
+name: CI (CUDA, windows)
+
+# TODO: this workflow is only triggered manually because it is very heavy on the CI
+#       when we provision dedicated windows runners, we can enable it for pushes too
+# note: running this workflow manually will populate the ccache for the release builds
+#       this can be used before merging a PR to speed up the release workflow
+on:
+  workflow_dispatch: # allows manual triggering
+
+# note: this will run in queue with the release workflow
+concurrency:
+  group: release
+  queue: max
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  cuda:
+    runs-on: windows-2022
+
+    strategy:
+      matrix:
+        cuda: ['12.4', '13.3']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!
+
+      - name: Install Cuda Toolkit
+        uses: ./.github/actions/windows-setup-cuda
+        with:
+          cuda_version: ${{ matrix.cuda }}
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DLLAMA_BUILD_SERVER=ON ^
+            -DLLAMA_BUILD_BORINGSSL=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=ON ^
+            -DGGML_CUDA=ON ^
+            -DGGML_RPC=ON ^
+            -DGGML_CUDA_CUB_3DOT2=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
+          cmake --build build --config Release
+
+  hip:
+    runs-on: windows-2022
+
+    env:
+      # Make sure this is in sync with build-cache.yml
+      HIPSDK_INSTALLER_VERSION: "26.Q1"
+
+    strategy:
+      matrix:
+        include:
+          # sync with release.yml
+          - name: "radeon"
+            gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Grab rocWMMA package
+        id: grab_rocwmma
+        run: |
+          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
+          7z x rocwmma.deb
+          7z x data.tar
+
+      - name: Use ROCm Installation Cache
+        uses: actions/cache@v5
+        id: cache-rocm
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Setup ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-rocm
+        with:
+          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          # TODO: this build does not match the build in release.yml, so we use a different cache key
+          #       ideally, the builds should match, similar to the CUDA build above so that we would be able
+          #       to populate the ccache for the release with manual runs of this workflow
+          #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+          key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DLLAMA_BUILD_BORINGSSL=ON `
+            -DROCM_DIR="${env:HIP_PATH}" `
+            -DGGML_HIP=ON `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGPU_TARGETS="gfx1100"  `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
--- a/.github/workflows/build-ibm.yml
+++ b/.github/workflows/build-ibm.yml
@@ -0,0 +1,150 @@
+name: CI (ibm)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-ibm.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-ibm.yml',
+      'ggml/src/ggml-cpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+
+  ubuntu-24-s390x:
+    runs-on: ubuntu-24.04-s390x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Swap Endianness
+        id: endianness
+        run: |
+          for f in models/*.gguf; do
+            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
+          done
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c (s390x)
+        id: llama2c_test_s390x
+        run: |
+          cd build
+          echo "Fetch llama2c big-endian model"
+          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
+          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  ubuntu-24-ppc64le:
+    runs-on: ubuntu-24.04-ppc64le
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@@ -15,9 +15,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  windows-msys2:
@@ -37,7 +37,7 @@ jobs:
      #- name: ccache
      #  uses: ggml-org/ccache-action@v1.2.16
      #  with:
-      #    key: windows-msys2
+      #    key: msys-windows-2025-x64
      #    variant: ccache
      #    evict-old-files: 1d
      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
--- a/.github/workflows/build-opencl.yml
+++ b/.github/workflows/build-opencl.yml
@@ -0,0 +1,82 @@
+name: CI (opencl)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-opencl.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-opencl.yml',
+      'ggml/src/ggml-opencl/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  windows-2025-opencl-adreno:
+    runs-on: windows-2025
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: opencl-windows-2025-x64
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -29,9 +29,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  ubuntu-24-openvino:
@@ -67,7 +67,7 @@ jobs:
        if: runner.environment == 'github-hosted'
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
+          key: openvino-ubuntu-24.04-${{ matrix.variant }}-no-preset-v1
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -84,7 +84,7 @@ jobs:
        id: cache-openvino
        with:
          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
--- a/.github/workflows/build-riscv.yml
+++ b/.github/workflows/build-riscv.yml
@@ -29,11 +29,84 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
+  ubuntu-cpu-riscv64-native:
+    runs-on: ubuntu-24.04-riscv
+
+    steps:
+      - name: Install dependencies
+        run: |
+          # Install necessary packages
+          sudo apt-get update
+          sudo apt-get install -y libssl-dev
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+
+          git lfs install
+
+      - name: Check environment
+        run: |
+          uname -a
+          gcc --version
+          g++ --version
+          ldd --version
+          cmake --version
+          rustc --version
+          env
+          echo "nproc=$(nproc)"
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
+      #  with:
+      #    key: riscv-ubuntu-native
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DGGML_RPC=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
  ubuntu-riscv64-native-sanitizer:
    runs-on: ubuntu-24.04-riscv

@@ -62,12 +135,13 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: ccache
-        uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-        with:
-          key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
+      #  with:
+      #    key: riscv-ubuntu-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
--- a/.github/workflows/build-rpc.yml
+++ b/.github/workflows/build-rpc.yml
@@ -0,0 +1,66 @@
+name: CI (rpc)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-rpc.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-rpc.yml',
+      'ggml/src/ggml-rpc/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  ubuntu-latest-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libssl-dev ninja-build
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
--- a/.github/workflows/build-sanitize.yml
+++ b/.github/workflows/build-sanitize.yml
@@ -22,66 +22,65 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  ubuntu-latest-sanitizer:
-    runs-on: ubuntu-latest
+  ctest:
+    runs-on: [self-hosted, X64, CPU, Linux]

    continue-on-error: true

    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
+      # with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
+      - name: Build (undefined)
+        id: cmake_build_undefined
+        if: ${{ matrix.sanitizer == 'UNDEFINED' }}
        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
+
+          cmake --build build --config Debug -j $(nproc)

      - name: Build
        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
+        if: ${{ matrix.sanitizer == 'ADDRESS' }}
        run: |
          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON

-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+          cmake --build build --config RelWithDebInfo -j $(nproc)

      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=OFF

-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+          cmake --build build --config RelWithDebInfo -j $(nproc)

      - name: Test
        id: cmake_test
+        # skip run in Debug - very slow
+        if: ${{ matrix.sanitizer != 'UNDEFINED' }}
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L main -E tokenizer --verbose --timeout 900
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -50,12 +50,12 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  ggml-ci-nvidia-cuda:
+  gpu-cuda:
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -69,7 +69,7 @@ jobs:
          nvidia-smi
          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-nvidia-vulkan-cm:
+  gpu-vulkan-nvidia-cm:
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -83,7 +83,7 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-nvidia-vulkan-cm2:
+  gpu-vulkan-nvidia-cm2:
    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]

    steps:
@@ -97,7 +97,7 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-nvidia-webgpu:
+  gpu-webgpu-nvidia:
    runs-on: [self-hosted, Linux, NVIDIA, X64]

    steps:
@@ -127,7 +127,7 @@ jobs:
            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  # TODO: provision AMX-compatible machine
-  #ggml-ci-cpu-amx:
+  #cpu-amx:
  #  runs-on: [self-hosted, Linux, CPU, AMX]

  #  steps:
@@ -141,7 +141,7 @@ jobs:
  #        bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  # TODO: provision AMD GPU machine
-  # ggml-ci-amd-vulkan:
+  # amd-vulkan:
  #   runs-on: [self-hosted, Linux, AMD]

  #   steps:
@@ -156,7 +156,7 @@ jobs:
  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  # TODO: provision AMD GPU machine
-  # ggml-ci-amd-rocm:
+  # amd-rocm:
  #   runs-on: [self-hosted, Linux, AMD]

  #   steps:
@@ -170,7 +170,7 @@ jobs:
  #         amd-smi static
  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-metal:
+  gpu-metal:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -183,7 +183,7 @@ jobs:
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-webgpu:
+  gpu-webgpu-apple:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -210,7 +210,7 @@ jobs:
          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-vulkan:
+  gpu-vulkan:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -224,7 +224,7 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-linux-intel-vulkan:
+  gpu-vulkan-intel-linux:
    runs-on: [self-hosted, Linux, Intel]

    steps:
@@ -240,7 +240,7 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-win-intel-vulkan:
+  gpu-vulkan-intel-windows:
    runs-on: [self-hosted, Windows, X64, Intel]

    steps:
@@ -261,7 +261,7 @@ jobs:
          # a valid python environment for testing
          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp

-  ggml-ci-intel-openvino-gpu-low-perf:
+  cpu-openvino-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

    concurrency:
@@ -297,8 +297,8 @@ jobs:
          source ./openvino_toolkit/setupvars.sh
          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-arm64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU]
+  cpu-any-low-perf:
+    runs-on: [self-hosted, CPU]

    steps:
      - name: Clone
@@ -310,8 +310,8 @@ jobs:
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-arm64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU]
+  cpu-any-high-perf:
+    runs-on: [self-hosted, CPU]

    steps:
      - name: Clone
@@ -323,34 +323,82 @@ jobs:
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-# TODO: not sure how to detect ARM flags on DGX Spark. currently get this error during cmake:
-#         CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message):
-#           ARM -march/-mcpu not found, -mcpu=native will be used
-#
-#       if we resolve this, we should be able to offload these jobs to the self-hosted runners
-#
-#  ggml-ci-arm64-cpu-high-perf-sve:
-#    runs-on: [self-hosted, Linux, ARM64, CPU]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-#
-#  ggml-ci-arm64-cpu-kleidiai:
-#    runs-on: [self-hosted, Linux, ARM64, CPU]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  cpu-arm64-graviton4:
+    runs-on: ah-ubuntu_22_04-c8g_8x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+          build-essential \
+          python3-venv \
+          gpg \
+          wget \
+          time \
+          git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+            | gpg --dearmor \
+            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+            | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-arm64-graviton4-kleidiai:
+    runs-on: ah-ubuntu_22_04-c8g_8x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+          build-essential \
+          python3-venv \
+          gpg \
+          wget \
+          time \
+          git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+            | gpg --dearmor \
+            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+            | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_KLEIDIAI=1 \
+          GG_BUILD_EXTRA_TESTS_0=1 \
+          bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -29,132 +29,134 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:

-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  ubuntu-24-sycl:
+#    strategy:
+#      matrix:
+#        build: [fp32]
+#        include:
+#          - build: fp32
+#            fp16: OFF
+#
+#    runs-on: ubuntu-24.04
+#
+#    env:
+#      ONEAPI_ROOT: /opt/intel/oneapi/
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#      LEVEL_ZERO_VERSION: "1.28.2"
+#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+#
+#    continue-on-error: true
+#
+#    steps:
+#      - uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          cd /tmp
+#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+#
+#      - name: Install Level Zero SDK
+#        shell: bash
+#        run: |
+#          cd /tmp
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+#
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: sycl-ubuntu-24-${{ matrix.build }}
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          source /opt/intel/oneapi/setvars.sh
+#          cmake -B build \
+#            -G "Ninja" \
+#            -DCMAKE_BUILD_TYPE=Release \
+#            -DGGML_SYCL=ON \
+#            -DCMAKE_C_COMPILER=icx \
+#            -DCMAKE_CXX_COMPILER=icpx \
+#            -DLLAMA_OPENSSL=OFF \
+#            -DGGML_NATIVE=OFF \
+#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+#          time cmake --build build --config Release -j $(nproc)

-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  windows-latest-sycl:
+#    runs-on: windows-2022
+#
+#    defaults:
+#      run:
+#        shell: bash
+#
+#    env:
+#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+#
+#      - name: Install Level Zero SDK
+#        shell: pwsh
+#        run: |
+#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: sycl-windows-latest
+#          variant: ccache
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+#
+#      - name: Build
+#        id: cmake_build
+#        run:  examples/sycl/win-build-sycl.bat
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -31,12 +31,59 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  ubuntu-24-vulkan-llvmpipe:
+  ubuntu:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-24.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: vulkan-${{ matrix.os }}
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Configure
+        id: cmake_configure
+        run: |
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DGGML_VULKAN=ON
+
+      - name: Build
+        id: cmake_build
+        run: |
+          time cmake --build build -j $(nproc)
+
+  ubuntu-llvmpipe:
    runs-on: ubuntu-24.04

    steps:
@@ -47,7 +94,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-24-vulkan-llvmpipe
+          key: vulkan-ubuntu-24.04-llvmpipe
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -68,7 +115,7 @@ jobs:
        id: cache-sdk
        with:
          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}

      - name: Setup Vulkan SDK
        if: steps.cache-sdk.outputs.cache-hit != 'true'
--- a/.github/workflows/build-webgpu.yml
+++ b/.github/workflows/build-webgpu.yml
@@ -0,0 +1,181 @@
+name: CI (webgpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-webgpu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.wgsl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-webgpu.yml',
+      'ggml/src/ggml-webgpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  macos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: webgpu-macos-latest
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export CMAKE_PREFIX_PATH=dawn
+          cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: webgpu-ubuntu-24.04
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers \
+            libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export Dawn_DIR=dawn/lib64/cmake/Dawn
+          cmake -B build \
+            -DGGML_WEBGPU=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900
+
+  ubuntu-wasm:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-24.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: webgpu-${{ matrix.os }}-wasm
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20260317.182325"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_OPENSSL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@@ -28,9 +28,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  ubuntu-22-hip-quality-check:
@@ -50,7 +50,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-22-hip-quality-check
+          key: hip-quality-check-ubuntu-22.04
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -27,18 +27,40 @@ on:
      '**/*.glsl'
    ]

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

+# note: run this workflow one at a time for better cache reuse
+concurrency:
+  group: release
+  queue: max
+
 jobs:
+  check_release:
+    runs-on: ubuntu-slim

-  macOS-cpu:
+    outputs:
+      should_release: ${{ steps.check.outputs.should_release }}

+    steps:
+      - id: check
+        run: |
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "should_release=true" >> $GITHUB_OUTPUT
+          elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
+            if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
+              echo "should_release=false" >> $GITHUB_OUTPUT
+            else
+              echo "should_release=true" >> $GITHUB_OUTPUT
+            fi
+          else
+            echo "should_release=false" >> $GITHUB_OUTPUT
+          fi
+
+  macos-cpu:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
    strategy:
      matrix:
        include:
@@ -46,10 +68,12 @@ jobs:
            arch: 'arm64'
            os: macos-14
            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
-          - build: 'arm64-kleidiai'
-            arch: 'arm64'
-            os: macos-14
-            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
+          # TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23780)
+          #       in order to enable it again, we have to provision dedicated runners  to run it
+          #- build: 'arm64-kleidiai'
+          #  arch: 'arm64'
+          #  os: macos-14
+          #  defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
          - build: 'x64'
            arch: 'x64'
            os: macos-15-intel
@@ -76,8 +100,8 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-${{ matrix.arch }}
-          evict-old-files: 1d
+          key: release-${{ matrix.os }}-${{ matrix.arch }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Build
        id: cmake_build
@@ -109,7 +133,8 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
-
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
    strategy:
      matrix:
        include:
@@ -140,8 +165,8 @@ jobs:
        if: ${{ matrix.build != 's390x' }}
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-cpu-${{ matrix.build }}
-          evict-old-files: 1d
+          key: release-${{ matrix.os }}-cpu
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Dependencies
        id: depends
@@ -186,6 +211,8 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    strategy:
      matrix:
@@ -214,8 +241,8 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-vulkan-${{ matrix.build }}
-          evict-old-files: 1d
+          key: release-${{ matrix.os }}-vulkan
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Dependencies
        id: depends
@@ -262,6 +289,8 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    runs-on: ubuntu-latest

@@ -282,11 +311,17 @@ jobs:
          cache: "npm"
          cache-dependency-path: "tools/ui/package-lock.json"

-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: android-arm64
-          evict-old-files: 1d
+      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
+      #        for some reason, the ccache does not improve the build time in this case
+      # example:
+      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
+      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
+      #
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: release-android-arm64
+      #    append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Set up JDK
        uses: actions/setup-java@v5
@@ -339,6 +374,8 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    runs-on: ubuntu-24.04

@@ -371,8 +408,8 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-24-openvino-release-no-preset-v1
-          evict-old-files: 1d
+          key: release-ubuntu-24.04-openvino-release-no-preset-v1
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Dependencies
        run: |
@@ -385,7 +422,7 @@ jobs:
        id: cache-openvino
        with:
          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
@@ -427,6 +464,8 @@ jobs:
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

  windows-cpu:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    runs-on: windows-2025

@@ -452,9 +491,8 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: windows-latest-cpu-${{ matrix.arch }}
-          variant: ccache
-          evict-old-files: 1d
+          key: release-windows-2025-${{ matrix.arch }}-cpu
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Install Ninja
        run: |
@@ -487,6 +525,8 @@ jobs:
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip

  windows:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    runs-on: windows-2025

@@ -521,9 +561,8 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
-          variant: ccache
-          evict-old-files: 1d
+          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Install Vulkan SDK
        id: get_vulkan
@@ -577,12 +616,14 @@ jobs:
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    runs-on: windows-2022

    strategy:
      matrix:
-        cuda: ['12.4', '13.1']
+        cuda: ['12.4', '13.3']

    steps:
      - name: Clone
@@ -596,12 +637,11 @@ jobs:
          cache: "npm"
          cache-dependency-path: "tools/ui/package-lock.json"

-      - name: Install ccache
+      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: windows-cuda-${{ matrix.cuda }}
-          variant: ccache
-          evict-old-files: 1d
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Install Cuda Toolkit
        uses: ./.github/actions/windows-setup-cuda
@@ -655,214 +695,216 @@ jobs:
          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

-  windows-sycl:
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  windows-sycl:
+#
+#    runs-on: windows-2022
+#
+#    defaults:
+#      run:
+#        shell: bash
+#
+#    env:
+#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+#
+#      - name: Install Level Zero SDK
+#        shell: pwsh
+#        run: |
+#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+#
+#      - name: Setup Node.js
+#        uses: actions/setup-node@v6
+#        with:
+#          node-version: "24"
+#          cache: "npm"
+#          cache-dependency-path: "tools/ui/package-lock.json"
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: release-windows-2022-x64-sycl
+#          append-timestamp: false # note: use this only with non-concurrent jobs!
+#
+#      - name: Build
+#        id: cmake_build
+#        shell: cmd
+#        run: |
+#          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+#          cmake -G "Ninja" -B build ^
+#            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
+#            -DCMAKE_BUILD_TYPE=Release ^
+#            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
+#            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
+#            -DLLAMA_BUILD_BORINGSSL=ON
+#          cmake --build build --target ggml-sycl -j
+#
+#      - name: Build the release package
+#        id: pack_artifacts
+#        run: |
+#          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+#
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+#          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+#          if [ -n "$ZE_LOADER_DLL" ]; then
+#            echo "Using Level Zero loader: $ZE_LOADER_DLL"
+#            cp "$ZE_LOADER_DLL" ./build/bin
+#          else
+#            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+#          fi
+#
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
+#
+#          echo "cp oneAPI running time dll files to ./build/bin done"
+#          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
+#
+#      - name: Upload the release package
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-bin-win-sycl-x64.zip
+#          name: llama-bin-win-sycl-x64.zip

-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-          cmake -G "Ninja" -B build ^
-            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON
-          cmake --build build --target ggml-sycl -j
-
-      - name: Build the release package
-        id: pack_artifacts
-        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
-          if [ -n "$ZE_LOADER_DLL" ]; then
-            echo "Using Level Zero loader: $ZE_LOADER_DLL"
-            cp "$ZE_LOADER_DLL" ./build/bin
-          else
-            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
-          fi
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
-
-          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload the release package
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
-  ubuntu-24-sycl:
-
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  ubuntu-24-sycl:
+#
+#    strategy:
+#      matrix:
+#        build: [fp32]
+#        include:
+#          - build: fp32
+#            fp16: OFF
+#
+#    runs-on: ubuntu-24.04
+#
+#    env:
+#      ONEAPI_ROOT: /opt/intel/oneapi/
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#      LEVEL_ZERO_VERSION: "1.28.2"
+#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          cd /tmp
+#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+#
+#      - name: Install Level Zero SDK
+#        shell: bash
+#        run: |
+#          cd /tmp
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+#
+#      - name: Setup Node.js
+#        uses: actions/setup-node@v6
+#        with:
+#          node-version: "24"
+#          cache: "npm"
+#          cache-dependency-path: "tools/ui/package-lock.json"
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: release-ubuntu-24.04-sycl
+#          append-timestamp: false # note: use this only with non-concurrent jobs!
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          source /opt/intel/oneapi/setvars.sh
+#          cmake -B build \
+#            -G "Ninja" \
+#            -DCMAKE_BUILD_TYPE=Release \
+#            -DGGML_SYCL=ON \
+#            -DCMAKE_C_COMPILER=icx \
+#            -DCMAKE_CXX_COMPILER=icpx \
+#            -DLLAMA_OPENSSL=OFF \
+#            -DGGML_NATIVE=OFF \
+#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+#          time cmake --build build --config Release -j $(nproc)
+#
+#      - name: Determine tag name
+#        id: tag
+#        uses: ./.github/actions/get-tag-name
+#
+#      - name: Pack artifacts
+#        id: pack_artifacts
+#        run: |
+#          cp LICENSE ./build/bin/
+#          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+#
+#      - name: Upload artifacts
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+#          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    runs-on: ubuntu-22.04

@@ -895,8 +937,8 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
-          evict-old-files: 1d
+          key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Dependencies
        id: depends
@@ -974,6 +1016,8 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}

    runs-on: windows-2022

@@ -1010,13 +1054,13 @@ jobs:
        uses: actions/cache@v5
        with:
          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
-          evict-old-files: 1d
+          key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!

      - name: Install ROCm
        if: steps.cache-rocm.outputs.cache-hit != 'true'
@@ -1088,6 +1132,8 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode-build:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
    runs-on: macos-15

    steps:
@@ -1143,98 +1189,101 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
          name: llama-${{ steps.tag.outputs.name }}-xcframework.zip

-
-  openEuler-cann:
-    strategy:
-      matrix:
-        include:
-          # 910b with aclgraph (both architectures)
-          - arch: x86
-            chip_type: '910b'
-            build: 'Release'
-            use_acl_graph: 'on'
-          - arch: aarch64
-            chip_type: '910b'
-            build: 'Release'
-            use_acl_graph: 'on'
-          # 310p without aclgraph (both architectures)
-          - arch: x86
-            chip_type: '310p'
-            build: 'Release'
-            use_acl_graph: 'off'
-          - arch: aarch64
-            chip_type: '310p'
-            build: 'Release'
-            use_acl_graph: 'off'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE} \
-                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  openEuler-cann:
+#    strategy:
+#      matrix:
+#        include:
+#          # 910b with aclgraph (both architectures)
+#          - arch: x86
+#            chip_type: '910b'
+#            build: 'Release'
+#            use_acl_graph: 'on'
+#          - arch: aarch64
+#            chip_type: '910b'
+#            build: 'Release'
+#            use_acl_graph: 'on'
+#          # 310p without aclgraph (both architectures)
+#          - arch: x86
+#            chip_type: '310p'
+#            build: 'Release'
+#            use_acl_graph: 'off'
+#          - arch: aarch64
+#            chip_type: '310p'
+#            build: 'Release'
+#            use_acl_graph: 'off'
+#    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+#    steps:
+#      - name: Checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Free up disk space
+#        uses: ggml-org/free-disk-space@v1.3.1
+#        with:
+#          tool-cache: true
+#
+#      - name: Set container image
+#        id: cann-image
+#        run: |
+#          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
+#          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+#
+#      - name: Pull container image
+#        run: docker pull "${{ steps.cann-image.outputs.image }}"
+#
+#      - name: Build
+#        env:
+#          BUILD_TYPE: ${{ matrix.build }}
+#          SOC_TYPE: ascend${{ matrix.chip_type }}
+#          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
+#        run: |
+#          HOST_UID=$(id -u)
+#          HOST_GID=$(id -g)
+#
+#          docker run --rm \
+#            -v "${PWD}:/workspace" \
+#            -w /workspace \
+#            -e SOC_TYPE=${SOC_TYPE} \
+#            -e BUILD_TYPE=${BUILD_TYPE} \
+#            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
+#            "${{ steps.cann-image.outputs.image }}" \
+#            bash -lc '
+#              set -e
+#              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
+#              yum clean all && rm -rf /var/cache/yum
+#              git config --global --add safe.directory "/workspace"
+#              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+#              cmake -S . -B build \
+#                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+#                  -DGGML_CANN=on \
+#                  -DSOC_TYPE=${SOC_TYPE} \
+#                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
+#              cmake --build build -j $(nproc)
+#
+#              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+#            '
+#
+#      - name: Determine tag name
+#        id: tag
+#        uses: ./.github/actions/get-tag-name
+#
+#      - name: Pack artifacts
+#        run: |
+#          cp LICENSE ./build/bin/
+#          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+#
+#      - name: Upload artifacts
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
+#          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz

  ui-build:
+    needs: [check_release]
+    if: ${{ needs.check_release.outputs.should_release == 'true' }}
    uses: ./.github/workflows/ui-build.yml

  release:
@@ -1251,17 +1300,17 @@ jobs:
      - windows
      - windows-cpu
      - windows-cuda
-      - windows-sycl
+      #- windows-sycl
      - windows-hip
      - ubuntu-22-rocm
      - ubuntu-cpu
      - ubuntu-vulkan
      - ubuntu-24-openvino
-      - ubuntu-24-sycl
+      #- ubuntu-24-sycl
      - android-arm64
-      - macOS-cpu
+      - macos-cpu
      - ios-xcode-build
-      - openEuler-cann
+      #- openEuler-cann
      - ui-build

    outputs:
@@ -1350,7 +1399,7 @@ jobs:

            **macOS/iOS:**
            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
-            - [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
+            - macOS Apple Silicon (arm64, KleidiAI enabled) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23780)
            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
            - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)

@@ -1362,8 +1411,7 @@ jobs:
            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
+            - Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)

            **Android:**
            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
@@ -1372,16 +1420,17 @@ jobs:
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
-            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
+            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
-            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
+            - Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

            **openEuler:**
-            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
-            - [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
-            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
-            - [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
+            - [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
+            - openEuler x86 (310p)
+            - openEuler x86 (910b, ACL Graph)
+            - openEuler aarch64 (310p)
+            - openEuler aarch64 (910b, ACL Graph)

            **UI:**
            - [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@@ -26,10 +26,10 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -37,7 +37,7 @@ concurrency:

 jobs:
  server:
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, CPU, Linux, llama-server]

    strategy:
      matrix:
@@ -46,19 +46,19 @@ jobs:
      fail-fast: false

    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
+      #- name: Dependencies
+      #  id: depends
+      #  run: |
+      #    sudo apt-get update
+      #    sudo apt-get -y install \
+      #      build-essential \
+      #      xxd \
+      #      git \
+      #      cmake \
+      #      curl \
+      #      wget \
+      #      language-pack-en \
+      #      libssl-dev

      - name: Clone
        id: checkout
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -29,10 +29,10 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -44,25 +44,20 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 jobs:
-  ui-build:
-    name: Build Web UI
-    uses: ./.github/workflows/ui-build.yml
+  ubuntu:
+    runs-on: ubuntu-24.04

-  server:
-    runs-on: ubuntu-latest
-    needs: ui-build
-
-    name: server (${{ matrix.wf_name }})
+    name: ubuntu (${{ matrix.wf_name }})
    strategy:
      matrix:
        build_type: [Release]
@@ -98,17 +93,17 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Download built UI
-        uses: actions/download-artifact@v7
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
        with:
-          name: ui-build
-          path: tools/ui/dist
+          key: server-ubuntu-24.04-x64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
-            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

@@ -135,8 +130,8 @@ jobs:
          export ${{ matrix.extra_args }}
          SLOW_TESTS=1 pytest -v -x

-  server-windows:
-    runs-on: windows-2022
+  windows:
+    runs-on: windows-2025

    steps:
      - name: Clone
@@ -146,16 +141,24 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
        with:
-          node-version: "24"
+          key: server-windows-2025-x64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
+        shell: cmd
        run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+          cmake -B build -G "Ninja Multi-Config" ^
+            -DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DLLAMA_BUILD_BORINGSSL=ON ^
+            -DGGML_SCHED_NO_REALLOC=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% --target llama-server

      - name: Python setup
        id: setup_python
--- a/.github/workflows/ui-self-hosted.yml
+++ b/.github/workflows/ui-self-hosted.yml
@@ -30,10 +30,10 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
--- a/.github/workflows/ui.yml
+++ b/.github/workflows/ui.yml
@@ -26,10 +26,10 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -63,6 +63,7 @@ After submitting your PR:
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
 - Let other maintainers merge their own PRs
 - When merging a PR, make sure you have a good understanding of the changes
+- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

 Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -66,6 +66,8 @@ fi

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+else
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -114,10 +116,7 @@ fi
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"

-    # if on Mac, disable METAL
    if [[ "$OSTYPE" == "darwin"* ]]; then
-        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
-
        MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
        MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
        if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
@@ -133,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"

    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
        if [ -z "${CMAKE_PREFIX_PATH}" ]; then
@@ -167,6 +166,8 @@ fi

 if [ ! -z ${GG_BUILD_BLAS} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
+else
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF"
 fi

 if [ ! -z ${GG_BUILD_OPENVINO} ]; then
@@ -700,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {

 ## main

-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
+export LLAMA_ARG_LOG_PREFIX=1
+export LLAMA_ARG_LOG_TIMESTAMPS=1

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2998,7 +2998,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
            key_file.close();
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE"));
    add_opt(common_arg(
        {"--ssl-key-file"}, "FNAME",
        "path to file a PEM-encoded SSL private key",
@@ -3026,7 +3026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.default_template_kwargs[item.key()] = item.value().dump();
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
    add_opt(common_arg(
        {"-to", "--timeout"}, "N",
        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -3327,7 +3327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params &, const std::string & value) {
            common_log_set_file(common_log_main(), value.c_str());
        }
-    ).set_env("LLAMA_LOG_FILE"));
+    ).set_env("LLAMA_ARG_LOG_FILE"));
    add_opt(common_arg(
        {"--log-colors"}, "[on|off|auto]",
        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -3344,7 +3344,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                    string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
            }
        }
-    ).set_env("LLAMA_LOG_COLORS"));
+    ).set_env("LLAMA_ARG_LOG_COLORS"));
    add_opt(common_arg(
        {"-v", "--verbose", "--log-verbose"},
        "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3359,7 +3359,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.offline = true;
        }
-    ).set_env("LLAMA_OFFLINE"));
+    ).set_env("LLAMA_ARG_OFFLINE"));
    add_opt(common_arg(
        {"-lv", "--verbosity", "--log-verbosity"}, "N",
        string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
@@ -3374,7 +3374,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.verbosity = value;
            common_log_set_verbosity_thold(value);
        }
-    ).set_env("LLAMA_LOG_VERBOSITY"));
+    ).set_env("LLAMA_ARG_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
        {"--no-log-prefix"},
--- a/conversion/init.py
+++ b/conversion/init.py
@@ -74,6 +74,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "Gemma3nForCausalLM": "gemma",
    "Gemma3nForConditionalGeneration": "gemma",
    "Gemma4ForConditionalGeneration": "gemma",
+    "Gemma4ForCausalLM": "gemma",
    "GemmaForCausalLM": "gemma",
    "Glm4ForCausalLM": "glm",
    "Glm4MoeForCausalLM": "glm",
@@ -215,6 +216,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "T5EncoderModel": "t5",
    "T5ForConditionalGeneration": "t5",
    "T5WithLMHeadModel": "t5",
+    "TalkieForCausalLM": "talkie",
    "UMT5ForConditionalGeneration": "t5",
    "UMT5Model": "t5",
    "UltravoxModel": "ultravox",
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -119,7 +119,8 @@ class ModelBase:
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                 disable_mistral_community_chat_template: bool = False,
                 sentence_transformers_dense_modules: bool = False,
-                 fuse_gate_up_exps: bool = False):
+                 fuse_gate_up_exps: bool = False,
+                 fp8_as_q8: bool = False):
        if type(self) is ModelBase or \
                type(self) is TextModel or \
                type(self) is MmprojModel:
@@ -148,6 +149,8 @@ class ModelBase:
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
        self._is_nvfp4 = False
        self._is_mxfp4 = False
+        self._fp8_as_q8 = fp8_as_q8
+        self._fp8_dequantized: set[str] = set()

        # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
        # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@@ -429,6 +432,8 @@ class ModelBase:
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                        tensors_to_remove.append(name)
+                        if self._fp8_as_q8:
+                            self._fp8_dequantized.add(weight_name)
                    if name.endswith(".activation_scale"):  # unused
                        tensors_to_remove.append(name)
                    if name.endswith("_activation_scale"):  # Mistral-Small-4-119B-2602, unused
@@ -440,6 +445,8 @@ class ModelBase:
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                        tensors_to_remove.append(name)
+                        if self._fp8_as_q8:
+                            self._fp8_dequantized.add(weight_name)
                    if name.endswith(".qscale_act"):
                        tensors_to_remove.append(name)
            elif quant_method == "gptq":
@@ -467,7 +474,14 @@ class ModelBase:
            elif quant_method == "compressed-tensors":
                quant_format = quant_config["format"]
                groups = quant_config["config_groups"]
-                if len(groups) > 1:
+                nvfp4_compressed_tensors = (
+                    quant_format == "nvfp4-pack-quantized"
+                    or quant_format == "mixed-precision"
+                    and bool(groups)
+                    and all(g.get("format") == "nvfp4-pack-quantized" for g in groups.values() if isinstance(g, dict))
+                )
+
+                if len(groups) > 1 and not nvfp4_compressed_tensors:
                    raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
                weight_config = tuple(groups.values())[0]["weights"]

@@ -476,6 +490,11 @@ class ModelBase:
                    strategy = weight_config.get("strategy")
                    assert strategy == "channel" or strategy == "block"
                    assert weight_config.get("group_size") is None  # didn't find a model using this yet
+                    is_fp8 = (
+                        quant_format == "float-quantized"
+                        and weight_config.get("type") == "float"
+                        and weight_config.get("num_bits") == 8
+                    )
                    for name in self.model_tensors.keys():
                        if name.endswith(".weight_scale"):
                            weight_name = name.removesuffix("_scale")
@@ -483,6 +502,8 @@ class ModelBase:
                            s = self.model_tensors[name]
                            self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
                            tensors_to_remove.append(name)
+                            if self._fp8_as_q8 and is_fp8:
+                                self._fp8_dequantized.add(weight_name)
                elif quant_format == "pack-quantized":
                    assert weight_config.get("strategy") == "group"
                    assert weight_config.get("type", "int") == "int"
@@ -505,6 +526,9 @@ class ModelBase:
                            tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
                            if (base_name + "_zero_point") in self.model_tensors:
                                tensors_to_remove.append(base_name + "_zero_point")
+                elif nvfp4_compressed_tensors:
+                    # Don't error from compressed-tensors, we'll handle them in _generate_nvfp4_tensors
+                    pass
                else:
                    raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
            elif quant_method == "modelopt":
@@ -514,10 +538,18 @@ class ModelBase:
                for name in self.model_tensors.keys():
                    if name.endswith(".weight_scale"):
                        weight_name = name.removesuffix("_scale")
+                        if weight_name not in self.model_tensors:
+                            tensors_to_remove.append(name)
+                            continue
                        w = self.model_tensors[weight_name]
                        s = self.model_tensors[name]
+                        is_fp8_weight = False
+                        if self._fp8_as_q8:
+                            is_fp8_weight = w().dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
                        self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
                        tensors_to_remove.append(name)
+                        if is_fp8_weight:
+                            self._fp8_dequantized.add(weight_name)
                    if name.endswith((".input_scale", ".k_scale", ".v_scale")):
                        tensors_to_remove.append(name)
            elif quant_method is not None:
@@ -605,8 +637,10 @@ class ModelBase:
        return [(new_name, data_torch)]

    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
-        del name, new_name, bid, n_dims  # unused
-
+        del new_name, bid  # unused
+        # Force FP8-original tensors to Q8_0 when requested; Q8_0 is faster than F16/BF16.
+        if self._fp8_as_q8 and name in self._fp8_dequantized and n_dims >= 2:
+            return gguf.GGMLQuantizationType.Q8_0
        return False

    # some models need extra generated tensors (like rope_freqs)
@@ -746,10 +780,13 @@ class ModelBase:
        del experts, merged

    def prepare_tensors(self):
-        # detect NVFP4 quantization (ModelOpt format)
-        quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
-        quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
-        quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
+        # detect NVFP4 quantization (ModelOpt and Compressed-tensors formats)
+        quantization_config = self.hparams.get("quantization_config") or {}
+        quant_algo = quantization_config.get("quant_algo")
+        quant_method = quantization_config.get("quant_method")
+        quant_format = quantization_config.get("format")
+        quant_groups = quantization_config.get("config_groups") or {}
+        quant_layers = quantization_config.get("quantized_layers") or {}
        quant_config_file = self.dir_model / "hf_quant_config.json"

        if (not quant_algo or not quant_layers) and quant_config_file.is_file():
@@ -760,13 +797,25 @@ class ModelBase:
                producer_name = (producer.get("name") or "").lower()
                if quant_method is None:
                    self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
+                    quant_method = producer_name
                quant_algo = quant_config.get("quant_algo", quant_algo)
+                quant_method = quant_config.get("quant_method", quant_method)
+                quant_format = quant_config.get("format", quant_format)
+                quant_groups = quant_config.get("config_groups", quant_groups) or {}
                quant_layers = quant_config.get("quantized_layers", quant_layers) or {}

        # Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
        # per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
+        nvfp4_compressed_tensors = quant_method == "compressed-tensors" and (
+            quant_format == "nvfp4-pack-quantized"
+            or quant_format == "mixed-precision"
+            and bool(quant_groups)
+            and all(g.get("format") == "nvfp4-pack-quantized" for g in quant_groups.values() if isinstance(g, dict))
+        )
        if quant_algo != "NVFP4":
-            if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
+            if nvfp4_compressed_tensors:
+                quant_algo = "NVFP4"
+            elif any(str(v.get("quant_algo")).endswith("NVFP4") for v in quant_layers.values() if isinstance(v, dict)):
                quant_algo = "NVFP4"

        self._is_nvfp4 = quant_algo == "NVFP4"
@@ -776,6 +825,28 @@ class ModelBase:
        # This must run before dequant_model so NVFP4 tensors are removed
        # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
        if self._is_nvfp4:
+            if nvfp4_compressed_tensors:
+                # Convert compressed-tensors 'global' scales into the reciprocal
+                def inverse_scale(gen):
+                    def load():
+                        scale = LazyTorchTensor.to_eager(gen()).float()
+                        return 1.0 / scale
+                    return load
+
+                # Change the compressed-tensors names to the ModelOpt names for handling consistently later
+                for name in list(self.model_tensors.keys()):
+                    if name.endswith(".weight_packed"):
+                        weight_name = name.removesuffix("_packed")
+                        if weight_name not in self.model_tensors:
+                            self.model_tensors[weight_name] = self.model_tensors.pop(name)
+                    elif name.endswith(".weight_global_scale"):
+                        scale2_name = name.replace(".weight_global_scale", ".weight_scale_2")
+                        if scale2_name not in self.model_tensors:
+                            self.model_tensors[scale2_name] = inverse_scale(self.model_tensors.pop(name))
+                    elif name.endswith(".input_global_scale"):
+                        input_scale_name = name.replace(".input_global_scale", ".input_scale")
+                        if input_scale_name not in self.model_tensors:
+                            self.model_tensors[input_scale_name] = inverse_scale(self.model_tensors.pop(name))
            self._generate_nvfp4_tensors()

        self.dequant_model()
@@ -1575,6 +1646,12 @@ class TextModel(ModelBase):
        if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
            # ref: https://huggingface.co/sarvamai/sarvam-30b
            res = "sarvam-moe"
+        if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
+            # ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
+            res = "talkie"
+        if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
+            # ref: https://huggingface.co/openbmb/MiniCPM5-1B
+            res = "minicpm5"

        if res is None:
            logger.warning("\n")
@@ -2364,10 +2441,9 @@ class MmprojModel(ModelBase):
        raise KeyError(f"could not find any of: {keys}")

    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, name, n_dims  # unused
        if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)


 class LazyTorchTensor(gguf.LazyBase):
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@@ -614,7 +614,7 @@ class Gemma3NModel(Gemma3Model):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Gemma4ForConditionalGeneration")
+@ModelBase.register("Gemma4ForConditionalGeneration", "Gemma4ForCausalLM")
 class Gemma4Model(Gemma3Model):
    model_arch = gguf.MODEL_ARCH.GEMMA4

--- a/conversion/qwen.py
+++ b/conversion/qwen.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-from pathlib import Path
 from typing import Any, Callable, Iterable, TYPE_CHECKING

 import torch
@@ -549,6 +548,7 @@ class _Qwen35MtpMixin:
    tensor_map: gguf.TensorNameMap
    no_mtp: bool
    mtp_only: bool
+    _original_block_count: int | None = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -557,22 +557,44 @@ class _Qwen35MtpMixin:
            self.block_count += self.hparams.get("mtp_num_hidden_layers", 0)
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

+    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
+        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
+        key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
+        type(self)._original_block_count = hparams.get(key)
+        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)  # ty: ignore[unresolved-attribute]
+
    @classmethod
    def filter_tensors(cls, item):
-        name, _ = item
+        assert cls._original_block_count is not None
+        # TODO: change TextModel to super()
+        if (titem := TextModel.filter_tensors(item)) is None:
+            return None
+        name, gen = titem
+        if name.startswith("model.mtp."):
+            name = name.replace("model.", "", 1)
        if name.startswith("mtp."):
            if cls.no_mtp:
                return None
-            return item
-        if cls.mtp_only:
-            canonical = name.replace("language_model.", "")
-            keep = canonical in (
+            remapper = {
+                "fc":                    "eh_proj",
+                "pre_fc_norm_embedding": "enorm",
+                "pre_fc_norm_hidden":    "hnorm",
+                "norm":                  "shared_head.norm",
+            }
+            parts = name.split(".", 3)
+            if len(parts) == 4 and parts[1] == "layers" and parts[2].isdecimal():
+                mtp_idx = int(parts[2])
+                name = f"model.layers.{cls._original_block_count + mtp_idx}.{parts[3]}"
+            elif len(parts) == 3 and parts[1] in remapper:
+                name = f"model.layers.{cls._original_block_count}.{remapper[parts[1]]}.{parts[2]}"
+        elif cls.mtp_only:
+            keep = name in (
                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
                "embed_tokens.weight", "norm.weight",
            )
            if not keep:
                return None
-        return super().filter_tensors(item)  # ty: ignore[unresolved-attribute]
+        return name, gen

    def set_gguf_parameters(self):
        super().set_gguf_parameters()  # ty: ignore[unresolved-attribute]
@@ -594,29 +616,6 @@ class _Qwen35MtpMixin:
            self.metadata.version, size_label=None, output_type=output_type, model_type=None)    # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
        self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.startswith("mtp."):
-            n_layer = self.hparams["num_hidden_layers"]
-            if name.find("layers.") != -1:
-                assert bid is not None
-                name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
-                bid = bid + n_layer
-            else:
-                remapper = {
-                    "mtp.fc":                    "model.layers.{bid}.eh_proj",
-                    "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
-                    "mtp.pre_fc_norm_hidden":    "model.layers.{bid}.hnorm",
-                    "mtp.norm":                  "model.layers.{bid}.shared_head.norm",
-                }
-                stem   = Path(name).stem
-                suffix = Path(name).suffix
-                tmpl   = remapper[stem] + suffix
-                for b in range(n_layer, self.block_count):
-                    yield from super().modify_tensors(data_torch, tmpl.format(bid=b), b)  # ty: ignore[unresolved-attribute]
-                return
-
-        yield from super().modify_tensors(data_torch, name, bid)  # ty: ignore[unresolved-attribute]
-

@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
 class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):
--- a/conversion/talkie.py
+++ b/conversion/talkie.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from typing import Iterable, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import LazyTorchTensor, ModelBase, TextModel, gguf
+
+
+@ModelBase.register("TalkieForCausalLM")
+class TalkieModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.TALKIE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # Talkie used F.rms_norm without an explicit eps
+        self.gguf_writer.add_layer_norm_rms_eps(torch.finfo(torch.float32).eps)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        prefix = f"model.blocks.{bid}." if bid is not None else ""
+        suffix = name.removeprefix(prefix)
+
+        if suffix == "attn_gain.a_g":
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, ".scale"), data_torch
+            return
+        elif suffix == "mlp_gain.a_g":
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, ".scale"), data_torch
+            return
+        elif suffix == "lm_head_gain.w_g":
+            self.gguf_writer.add_logit_scale(LazyTorchTensor.to_eager(data_torch).item())
+            return
+        elif suffix in ("attn.attn_query.weight", "attn.attn_key.weight"):
+            # absorb inverse rope
+            head_dim = self.hparams["head_dim"]
+            shape = data_torch.shape
+            data_torch = torch.reshape(data_torch, (-1, head_dim, shape[-1]))
+            signs = torch.ones((1, head_dim, 1), dtype=data_torch.dtype)
+            signs[:, head_dim // 2 :, :] = -1
+            if self.lazy:
+                signs = LazyTorchTensor.from_eager(signs)
+            # (n_head, head_dim, n_in) -> (n_out, n_in)
+            data_torch = torch.reshape(data_torch * signs, shape)
+        elif suffix == "attn.head_gain.head_g":
+            # allow head gain to broadcast
+            data_torch = data_torch.unsqueeze(-1)
+
+        if not name.endswith(".weight"):
+            name += ".weight"
+
+        yield from super().modify_tensors(data_torch, name, bid)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -148,6 +148,10 @@ def parse_args() -> argparse.Namespace:
        "--fuse-gate-up-exps", action="store_true",
        help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.",
    )
+    parser.add_argument(
+        "--fp8-as-q8", action="store_true",
+        help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
+    )

    args = parser.parse_args()
    if not args.print_supported_models and args.model is None:
@@ -264,7 +268,8 @@ def main() -> None:
                                     small_first_shard=args.no_tensor_first_split,
                                     remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
-                                     fuse_gate_up_exps=args.fuse_gate_up_exps
+                                     fuse_gate_up_exps=args.fuse_gate_up_exps,
+                                     fp8_as_q8=args.fp8_as_q8,
                                     )

        if args.vocab_only:
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -156,6 +156,8 @@ models = [
    {"name": "kanana2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
    {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
    {"name": "sarvam-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
+    {"name": "talkie",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
+    {"name": "minicpm5",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -208,6 +208,16 @@ class LoraTorchTensor:
    def to(self, *args, **kwargs):
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

+    def __mul__(self, other) -> LoraTorchTensor:
+        # Only output-side multiplication for now
+        # W = B @ A, so M_out * W == (M_out * B) @ A
+        if not isinstance(other, (int, float)) and other.shape and other.shape[-1] != 1:
+            raise NotImplementedError
+        return LoraTorchTensor(self._lora_A, self._lora_B * other)
+
+    def __rmul__(self, other) -> LoraTorchTensor:
+        return self * other
+
    @classmethod
    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
        del types  # unused
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@@ -459,7 +459,7 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend

 - Usage: `./bin/llama-template-analysis path/to/template.jinja`

-**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
+**Debug Logging**: Enable with `LLAMA_ARG_LOG_VERBOSITY=2`

 - Shows detailed analysis steps, pattern extraction results, and generated parser structure

--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -743,6 +743,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
 | GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
+| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |

@@ -753,6 +754,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
 | Name            | Function                                                                         |
 |-----------------|----------------------------------------------------------------------------------|
 | DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
+| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |

 ## Design Rule

--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
 This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.

 ```
-~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
+~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.7
 [d]/> cd /workspace
 ```

--- a/docs/multimodal/granitevision.md
+++ b/docs/multimodal/granitevision.md
@@ -176,7 +176,7 @@ Note that currently you cannot quantize the visual encoder because granite visio


 ### 5. Running the Model in Llama cpp
-Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
+Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.

 ```bash
 $ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO

 ## HuggingFace utilities
 The following targets are useful for creating collections and model repositories
-on Hugging Face in the the ggml-org. These can be used when preparing a release
+on Hugging Face in the ggml-org. These can be used when preparing a release
 to script the process for new model releases.

 For the following targets a `HF_TOKEN` environment variable is required.
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)

 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 12)
+set(GGML_VERSION_MINOR 13)
 set(GGML_VERSION_PATCH 0)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1189,8 +1189,8 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // a - x
-    // b - dy
+    // a - dy
+    // b - x
    GGML_API struct ggml_tensor * ggml_silu_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o

 static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
    // shift all elements after idx by 1 to the left, overwriting the element at idx
-    for (int i = idx; i < chunk->n_free_blocks; i++) {
+    for (int i = idx; i < chunk->n_free_blocks - 1; i++) {
        chunk->free_blocks[i] = chunk->free_blocks[i+1];
    }
    chunk->n_free_blocks--;
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -13,6 +13,7 @@
 #include <cstring>
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -392,64 +393,100 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
 // meta backend buffer
 //

+// Container to hold the tensor slices per simple ggml backend buffer.
+struct ggml_backend_meta_simple_tensor_container {
+    std::vector<ggml_context_ptr> ctxs;
+    std::map<const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
+
+    ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) {
+        ctxs.reserve(n_simple);
+        for (int i = 0; i < n_simple; i++) {
+            ctxs.emplace_back(ggml_init(params));
+        }
+    }
+    ggml_backend_meta_simple_tensor_container() {}
+};
+
 struct ggml_backend_meta_buffer_context {
+    // FIXME
+    // Most tensors can simply be stored statically in their own buffer.
+    // Externally created views however also need a mapping to simple tensors but they use the buffer of the view source.
+    // If external views are simply using that buffer they will slowly deplete its memory.
+    // Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp.
+    // Long-term: tie the lifetime of external views to the meta backend executing the graph instead,
+    //     currently not possible due to graph-external operations in the backend scheduler.
+    ggml_backend_meta_simple_tensor_container stc_static;
+    ggml_backend_meta_simple_tensor_container stc_compute[2];
+    int stc_compute_index      = 0;
+    int stc_compute_index_next = 0;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    // FIXME
+    // The size of the split state cache is unbounded and can theoretically grow infinitely large.
+    // However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive.
    static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
-
    std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
-    std::map<          const ggml_tensor *,        std::vector<ggml_tensor *>>                           simple_tensors;
-
-    struct buffer_config {
-        ggml_context          * ctx;
-        ggml_backend_buffer_t   buf;
-
-        buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {}
-    };
-    std::vector<buffer_config> buf_configs;

    int debug;

-    ggml_backend_meta_buffer_context() {
+    ggml_backend_meta_buffer_context(
+            ggml_backend_meta_simple_tensor_container & stc_static,
+            ggml_backend_meta_simple_tensor_container & stc_compute_0,
+            ggml_backend_meta_simple_tensor_container & stc_compute_1,
+            const std::vector<ggml_backend_buffer_t> & bufs)
+            : stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} {
+        this->bufs.reserve(bufs.size());
+        for (ggml_backend_buffer_t buf : bufs) {
+            this->bufs.emplace_back(buf);
+        }
        const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG");
        debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0;
    }
+
+    ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) {
+        if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) {
+            return stc_static;
+        }
+        return stc_compute[stc_compute_index];
+    }
 };

 static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
-    for (auto & [ctx, buf] : buf_ctx->buf_configs) {
-        ggml_backend_buffer_free(buf);
-        ggml_free(ctx);
-    }
    delete buf_ctx;
 }

 static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) {
    GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
-    return buf_ctx->buf_configs.size();
+    return buf_ctx->bufs.size();
 }

 static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) {
    GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
-    GGML_ASSERT(index < buf_ctx->buf_configs.size());
-    return buf_ctx->buf_configs[index].buf;
+    GGML_ASSERT(index < buf_ctx->bufs.size());
+    return buf_ctx->bufs[index].get();
 }

 static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) {
    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
-    GGML_ASSERT(index < buf_ctx->buf_configs.size());
+    GGML_ASSERT(index < buf_ctx->bufs.size());

-    auto it = buf_ctx->simple_tensors.find(tensor);
-    if (it == buf_ctx->simple_tensors.end()) {
+    ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor);
+    auto it = stc.simple_tensors.find(tensor);
+    if (it == stc.simple_tensors.end()) {
        return nullptr;
    }
    return it->second[index];
 }

-static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
+
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
+        ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
    const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;

@@ -785,7 +822,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
                src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
                continue;
            }
-            src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
+            src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
            GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
        }

@@ -1079,17 +1116,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
    return ret;
 }

+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync);
+}
+
 static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
    GGML_UNUSED(buffer);
    return (void *) 0x1000000000000000; // FIXME
 }

-static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
-    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
-    const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
+static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);

-    const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true);
+    const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true);
    GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
    GGML_ASSERT(split_state.n_segments <= 16);

@@ -1104,8 +1147,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
    std::vector<ggml_tensor *> simple_tensors;
    simple_tensors.reserve(n_simple_bufs);
    for (size_t j = 0; j < n_simple_bufs; j++) {
-        ggml_context          * simple_ctx = buf_ctx->buf_configs[j].ctx;
-        ggml_backend_buffer_t   simple_buf = buf_ctx->buf_configs[j].buf;
+        ggml_context          * simple_ctx = stc.ctxs[j].get();
+        ggml_backend_buffer_t   simple_buf = buf_ctx->bufs[j].get();

        if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
            // TODO: the following assert fails for llama-parallel even though the results are correct:
@@ -1158,7 +1201,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
            t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
        } else if (simple_buf != nullptr) {
            t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
-                + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
+                + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer));
        }
        t_ij->extra = tensor->extra;
        for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -1194,11 +1237,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
        }
    }

-    buf_ctx->simple_tensors[tensor] = simple_tensors;
+    stc.simple_tensors[tensor] = simple_tensors;

    return GGML_STATUS_SUCCESS;
 }

+static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
+    buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next;
+    return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor);
+}
+
 static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
    GGML_ASSERT(ggml_is_contiguous(tensor));
@@ -1413,8 +1463,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }

 static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) {
-    const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer);
-    for (size_t i = 0; i < n_buffers; i++) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
+    for (size_t i = 0; i < buf_ctx->bufs.size(); i++) {
        ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i));
    }
 }
@@ -1440,21 +1491,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) {
 static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);

-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
+    const ggml_init_params params = {
+        /*.mem_size   =*/ 1024*1024*ggml_tensor_overhead(), // FIXME
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
+    ggml_backend_meta_simple_tensor_container stc_static;
+    ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts);

-    ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context();
    size_t max_size = 0;
-    buf_ctx->buf_configs.reserve(n_simple_bufts);
+    std::vector<ggml_backend_buffer_t> bufs;
+    bufs.reserve(n_simple_bufts);
    for (size_t i = 0; i < n_simple_bufts; i++) {
-        ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
-        GGML_ASSERT(simple_buf != nullptr);
-        max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
-        buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
+        bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size));
+        GGML_ASSERT(bufs.back() != nullptr);
+        max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back()));
    }
+    ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);

    return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size);
 }
@@ -1462,26 +1516,32 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
 struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);

-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
+    constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals.
+    const ggml_init_params params_static = {
+        /*.mem_size   =*/ ggml_get_mem_size(ctx),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
+    const ggml_init_params params_compute = {
+        /*.mem_size   =*/ compute_headroom*ggml_get_mem_size(ctx),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_backend_meta_simple_tensor_container stc_static   (params_static,  n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts);

-    ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context();
-    meta_buf_ctx->buf_configs.reserve(n_simple_bufts);
-    for (size_t i = 0; i < n_simple_bufts; i++) {
-        meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr);
-    }
+    std::vector<ggml_backend_buffer_t> bufs(n_simple_bufts, nullptr);
+    ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);

    ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0);
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
        t->buffer = meta_buf;
-        ggml_backend_meta_buffer_init_tensor(meta_buf, t);
+        ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t);
        t->data = (void *) 0x2000000000000000; // FIXME
    }
    for (size_t i = 0; i < n_simple_bufts; i++) {
-        ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx;
+        ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get();
        ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);

        // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
@@ -1494,15 +1554,15 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc
            }
        }
        if (any_nonzero_slice) {
-            meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft);
+            meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft));
        } else {
-            meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0);
+            meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0));
            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-                t->buffer = meta_buf_ctx->buf_configs[i].buf;
+                t->buffer = meta_buf_ctx->bufs[i].get();
            }
        }
-        GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr);
-        meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
+        GGML_ASSERT(meta_buf_ctx->bufs[i]);
+        meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get()));
    }
    return meta_buf;
 }
@@ -1724,6 +1784,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
    }

    if (needs_rebuild) {
+        std::set<ggml_backend_buffer_t> used_buffers;
+        for (int i = 0; i < cgraph->n_leafs; i++) {
+            if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) {
+                used_buffers.emplace(cgraph->leafs[i]->buffer);
+            }
+        }
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) {
+                used_buffers.emplace(cgraph->nodes[i]->buffer);
+            }
+        }
+        for (ggml_backend_buffer_t buf : used_buffers) {
+            ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context;
+            buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1;
+            ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next];
+            for (ggml_context_ptr & ctx : stc.ctxs) {
+                ggml_reset(ctx.get());
+            }
+            stc.simple_tensors.clear();
+        }
        size_t n_subgraphs  = 0;
        size_t max_tmp_size = 0;

@@ -1909,7 +1989,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
            const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
            const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
            const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
-            ggml_init_params params = {
+            const ggml_init_params params = {
                /*.mem_size   =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
                /*.mem_buffer =*/ nullptr,
                /*.no_alloc   =*/ true,
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -273,67 +273,51 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G

 #if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
-        const int sve_register_length = svcntb() * 8; //get vector length
-        const int ggml_f16_epr = sve_register_length / 16; // running when 16
-        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+        const int ggml_f16_epr = svcnth();
+        const int ggml_f16_step = 8 * ggml_f16_epr;
+        const int np = n - (n % ggml_f16_step);
+        const int np2 = n - (n % ggml_f16_epr);

-        const int np= (n & ~(ggml_f16_step - 1));
-        svfloat16_t sum1 = svdup_n_f16(0.0f);
-        svfloat16_t sum2 = svdup_n_f16(0.0f);
-        svfloat16_t sum3 = svdup_n_f16(0.0f);
-        svfloat16_t sum4 = svdup_n_f16(0.0f);
+        svfloat32_t sum1_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum1_hi = svdup_n_f32(0.0f);
+        svfloat32_t sum2_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum2_hi = svdup_n_f32(0.0f);
+        svfloat32_t sum3_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum3_hi = svdup_n_f32(0.0f);
+        svfloat32_t sum4_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum4_hi = svdup_n_f32(0.0f);

-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
        for (int i = 0; i < np; i += ggml_f16_step) {
-            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
-            sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
-
-            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
-            sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
-
-            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-            sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
-
-            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-            sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
-
-            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-            sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
-
-            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-            sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
-
-            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-            sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
-
-            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-            sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
+            ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0), GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0));
+            ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1), GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1));
+            ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2), GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2));
+            ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3), GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3));
+            ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4), GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4));
+            ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5), GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5));
+            ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6), GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6));
+            ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7), GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7));
        }

-        const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
-        for (int k = np; k < np2; k += ggml_f16_epr) {
-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
-            sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
+        for (int i = np; i < np2; i += ggml_f16_epr) {
+            ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i, 0), GGML_F16x_VEC_LOAD(y + i, 0));
        }

        if (np2 < n) {
-            svbool_t pg = svwhilelt_b16(np2, n);
-            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+            const svbool_t pg = svwhilelt_b16(np2, n);
+            const svfloat16_t rx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            const svfloat16_t ry = svld1_f16(pg, (const __fp16 *)(y + np2));

-            sum1 = svmad_f16_x(pg, hx, hy, sum1);
+            ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, rx, ry);
        }
-        GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+
+        sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum2_lo);
+        sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum2_hi);
+        sum3_lo = svadd_f32_m(DEFAULT_PG32, sum3_lo, sum4_lo);
+        sum3_hi = svadd_f32_m(DEFAULT_PG32, sum3_hi, sum4_hi);
+        sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum3_lo);
+        sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum3_hi);
+
+        sumf = ggml_sve_sum_f32x2(sum1_lo, sum1_hi);
    #elif defined(__riscv_v_intrinsic)
        #if defined(__riscv_zvfh)
            int vl = __riscv_vsetvlmax_e32m2();
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -14,6 +14,35 @@
 // floating point type used to accumulate sums
 typedef double ggml_float;

+#if defined(__ARM_FEATURE_SVE)
+inline static void ggml_sve_f16_fma_widened(
+        svfloat32_t * acc_lo,
+        svfloat32_t * acc_hi,
+        svfloat16_t x,
+        svfloat16_t y) {
+#if defined(__ARM_FEATURE_SVE2)
+    *acc_lo = svmlalb_f32(*acc_lo, x, y);
+    *acc_hi = svmlalt_f32(*acc_hi, x, y);
+#else
+    // Plain SVE fallback path if SVE2 instructions not available
+    svfloat16_t x_even = svtrn1_f16(x, x);
+    svfloat16_t x_odd = svtrn2_f16(x, x);
+
+    svfloat16_t y_even = svtrn1_f16(y, y);
+    svfloat16_t y_odd = svtrn2_f16(y, y);
+
+    svbool_t pg = svptrue_b32();
+
+    *acc_lo = svmla_f32_x(pg, *acc_lo, svcvt_f32_f16_x(pg, x_even), svcvt_f32_f16_x(pg, y_even));
+    *acc_hi = svmla_f32_x(pg, *acc_hi, svcvt_f32_f16_x(pg, x_odd), svcvt_f32_f16_x(pg, y_odd));
+#endif
+}
+
+inline static ggml_float ggml_sve_sum_f32x2(svfloat32_t sum_lo, svfloat32_t sum_hi) {
+    return (ggml_float) (svaddv_f32(svptrue_b32(), sum_lo) + svaddv_f32(svptrue_b32(), sum_hi));
+}
+#endif
+
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16

@@ -122,108 +151,61 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 #if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)

-        const int sve_register_length = svcntb() * 8;
-        const int ggml_f16_epr = sve_register_length / 16; // running when 16
-        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+        const int ggml_f16_epr = svcnth();
+        const int ggml_f16_step = 2 * ggml_f16_epr;
+        int np = n - (n % ggml_f16_step);
+        int np2 = n - (n % ggml_f16_epr);

-        int np = (n & ~(ggml_f16_step - 1));
-
-        svfloat16_t sum_00 = svdup_n_f16(0.0f);
-        svfloat16_t sum_01 = svdup_n_f16(0.0f);
-        svfloat16_t sum_02 = svdup_n_f16(0.0f);
-        svfloat16_t sum_03 = svdup_n_f16(0.0f);
-
-        svfloat16_t sum_10 = svdup_n_f16(0.0f);
-        svfloat16_t sum_11 = svdup_n_f16(0.0f);
-        svfloat16_t sum_12 = svdup_n_f16(0.0f);
-        svfloat16_t sum_13 = svdup_n_f16(0.0f);
-
-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        svfloat32_t sum_0_0_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum_0_0_hi = svdup_n_f32(0.0f);
+        svfloat32_t sum_0_1_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum_0_1_hi = svdup_n_f32(0.0f);
+        svfloat32_t sum_1_0_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum_1_0_hi = svdup_n_f32(0.0f);
+        svfloat32_t sum_1_1_lo = svdup_n_f32(0.0f);
+        svfloat32_t sum_1_1_hi = svdup_n_f32(0.0f);

        for (int i = 0; i < np; i += ggml_f16_step) {
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
+            const svfloat16_t ay0 = GGML_F16x_VEC_LOAD(y + i, 0);
+            const svfloat16_t ax00 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
+            const svfloat16_t ax01 = GGML_F16x_VEC_LOAD(x[1] + i, 0);

-            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
-            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1);     // sum_00 = sum_00+ax1*ay1
-            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
-            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
+            ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax00, ay0);
+            ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax01, ay0);

-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
+            const svfloat16_t ay1 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 0);
+            const svfloat16_t ax10 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 0);
+            const svfloat16_t ax11 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 0);

-            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
-            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
-            ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
-            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
-
-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-
-            ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
-            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-            ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
-            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
-
-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-
-            ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
-            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
-            ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
-            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
-
-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-
-            ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
-
-            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
-            ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
-            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
-
-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-
-            ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
-
-            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
-            ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
-            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
-
-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-
-            ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
-
-            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
-            ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
-            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
-
-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-
-            ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
-
-            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
-            ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
-            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
+            ggml_sve_f16_fma_widened(&sum_0_1_lo, &sum_0_1_hi, ax10, ay1);
+            ggml_sve_f16_fma_widened(&sum_1_1_lo, &sum_1_1_hi, ax11, ay1);
        }

-        const int np2 = (n & ~(ggml_f16_epr - 1));
-        for (int k = np; k < np2; k += ggml_f16_epr) {
-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+        for (int i = np; i < np2; i += ggml_f16_epr) {
+            const svfloat16_t ry = GGML_F16x_VEC_LOAD(y + i, 0);
+            const svfloat16_t rx0 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
+            const svfloat16_t rx1 = GGML_F16x_VEC_LOAD(x[1] + i, 0);

-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
-            sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
-            rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
-            sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
+            ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, rx0, ry);
+            ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, rx1, ry);
        }

        if (np2 < n) {
-            svbool_t pg = svwhilelt_b16(np2, n);
-            svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
-            svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+            const svbool_t pg = svwhilelt_b16(np2, n);
+            const svfloat16_t ay = svld1_f16(pg, (const __fp16 *)(y + np2));
+            const svfloat16_t ax0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
+            const svfloat16_t ax1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));

-            sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
-            sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
+            ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax0, ay);
+            ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax1, ay);
        }
-        GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
-        GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
+
+        svfloat32_t sum_0_lo = svadd_f32_x(DEFAULT_PG32, sum_0_0_lo, sum_0_1_lo);
+        svfloat32_t sum_0_hi = svadd_f32_x(DEFAULT_PG32, sum_0_0_hi, sum_0_1_hi);
+        svfloat32_t sum_1_lo = svadd_f32_x(DEFAULT_PG32, sum_1_0_lo, sum_1_1_lo);
+        svfloat32_t sum_1_hi = svadd_f32_x(DEFAULT_PG32, sum_1_0_hi, sum_1_1_hi);
+        sumf[0] = ggml_sve_sum_f32x2(sum_0_lo, sum_0_hi);
+        sumf[1] = ggml_sve_sum_f32x2(sum_1_lo, sum_1_hi);
        np = n;
    #elif defined(__riscv_v_intrinsic)
        #if defined(__riscv_zvfh)
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -110,11 +110,14 @@
 #    define GGML_CUDA_USE_CUB
 #endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070

-// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8 and excludes HIP/MUSA.
+// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8.
+// However, this has been bugged in CTK < 12.3 for MSVC builds, see
+// https://github.com/ggml-org/llama.cpp/pull/22522#discussion_r3302393293
 // __CUDA_ARCH__  is undefined in host passes; GPU arch check happens in device-side code.
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && \
+    (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))
 #    define GGML_CUDA_USE_PDL
-#endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
+#endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))

 static __device__ __forceinline__ void ggml_cuda_pdl_sync() {
 #if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -472,7 +472,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(

            const int i = 8 * (threadIdx.x % (nbatch_fa/8));

-            cp_async_cg_16<preload>(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + j_vram*stride_mask + i);
+            cp_async_cg_16<preload>(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + int64_t(j_vram)*stride_mask + i);
        }
    } else if constexpr (oob_check) {
 #pragma unroll
@@ -488,7 +488,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
            for (int i0 = 0; i0 < nbatch_fa; i0 += warp_size) {
                const int i = i0 + threadIdx.x;

-                tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[j_vram*stride_mask + i] : half(0.0f);
+                tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[int64_t(j_vram)*stride_mask + i] : half(0.0f);
            }
        }
    } else if constexpr (nbatch_fa < 2*warp_size) {
@@ -505,7 +505,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(

            const int i = threadIdx.x % (warp_size/cols_per_warp);

-            ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + j_vram*stride_mask + 2*i);
+            ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + int64_t(j_vram)*stride_mask + 2*i);
        }
    } else {
 #pragma unroll
@@ -521,7 +521,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
            for (int i0 = 0; i0 < nbatch_fa; i0 += 2*warp_size) {
                const int i = i0 + 2*threadIdx.x;

-                ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + j_vram*stride_mask + i);
+                ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + int64_t(j_vram)*stride_mask + i);
            }
        }
    }
--- a/ggml/src/ggml-cuda/fwht.cu
+++ b/ggml/src/ggml-cuda/fwht.cu
@@ -0,0 +1,101 @@
+#include "common.cuh"
+#include "fwht.cuh"
+
+template <int N>
+__launch_bounds__(4*ggml_cuda_get_physical_warp_size(), 1)
+__global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows, const float scale) {
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    const int64_t r = (int64_t) blockIdx.x * blockDim.y + threadIdx.y;
+
+    if (r >= n_rows) {
+        return;
+    }
+
+    src += r * N;
+    dst += r * N;
+
+    static constexpr int el_w = N / warp_size;
+    float     reg[el_w];
+    const int lane = threadIdx.x;
+
+    ggml_cuda_pdl_sync();
+#pragma unroll
+    for (int i = 0; i < el_w; ++i) {
+        reg[i] = src[i * warp_size + lane] * scale;
+    }
+
+#pragma unroll
+    for (int h = 1; h < warp_size; h *= 2) {
+#pragma unroll
+        for (int j = 0; j < el_w; j++) {
+            const float val  = reg[j];
+            const float val2 = __shfl_xor_sync(0xFFFFFFFF, val, h, warp_size);
+
+            reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
+        }
+    }
+
+#pragma unroll
+    for (int h = warp_size; h < N; h *= 2) {
+        const int step = h / warp_size;
+#pragma unroll
+        for (int j = 0; j < el_w; j += 2 * step) {
+#pragma unroll
+            for (int k = 0; k < step; k++) {
+                const float x = reg[j + k];
+                const float y = reg[j + k + step];
+
+                reg[j + k]        = x + y;
+                reg[j + k + step] = x - y;
+            }
+        }
+    }
+
+#pragma unroll
+    for (int i = 0; i < el_w; ++i) {
+        dst[i * warp_size + lane] = reg[i];
+    }
+}
+
+bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src, dst));
+    if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) {
+        return false;
+    }
+    const int     n    = src->ne[0];
+    const int64_t rows = ggml_nrows(src);
+
+    const float * src_d = (const float *) src->data;
+    float *       dst_d = (float *) dst->data;
+
+    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
+    const int rows_per_block = 4;
+
+    const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block;
+
+    cudaStream_t                         stream = ctx.stream();
+    dim3                                 grid_dims(num_blocks, 1, 1);
+    dim3                                 block_dims(warp_size, rows_per_block, 1);
+    const ggml_cuda_kernel_launch_params launch_params =
+        ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
+
+    const float scale = 1 / sqrtf(n);
+
+    switch (n) {
+        case 64:
+            ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        case 128:
+            ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        case 256:
+            ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        case 512:
+            ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
+            return true;
+        default:
+            return false;
+    }
+}
--- a/ggml/src/ggml-cuda/fwht.cuh
+++ b/ggml/src/ggml-cuda/fwht.cuh
@@ -0,0 +1,4 @@
+#include "common.cuh"
+
+// Returns whether the Fast Walsh-Hadamard transform could be used.
+bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -24,6 +24,7 @@
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
+#include "ggml-cuda/fwht.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmf.cuh"
@@ -2569,6 +2570,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
            use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
            use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
            use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+            use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
        }
    } else {
@@ -2577,6 +2579,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
        use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
        use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
        use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+        use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
    }

@@ -2594,6 +2597,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
    bool use_batched_cublas_f32  = src0->type == GGML_TYPE_F32;

+    const int32_t hint = ggml_get_op_params_i32(dst, 1);
+    if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
+        return;
+    }
+
    if (!split && use_mul_mat_vec_f) {
        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
@@ -4986,8 +4994,14 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
 }

 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU;
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
+
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
+
+    return prop.integrated
+        ? GGML_BACKEND_DEVICE_TYPE_IGPU
+        : GGML_BACKEND_DEVICE_TYPE_GPU;
 }

 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -63,6 +63,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {

 enum mmvq_parameter_table_id {
    MMVQ_PARAMETERS_GENERIC = 0,
+    MMVQ_PARAMETERS_TURING,
    MMVQ_PARAMETERS_GCN,
    MMVQ_PARAMETERS_RDNA2,
    MMVQ_PARAMETERS_RDNA3_0,
@@ -78,6 +79,8 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
    return MMVQ_PARAMETERS_RDNA2;
 #elif defined(GCN) || defined(CDNA)
    return MMVQ_PARAMETERS_GCN;
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && __CUDA_ARCH__ < GGML_CUDA_CC_AMPERE
+    return MMVQ_PARAMETERS_TURING;
 #else
    return MMVQ_PARAMETERS_GENERIC;
 #endif
@@ -96,6 +99,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
    if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
        return MMVQ_PARAMETERS_GCN;
    }
+    if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING && ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_AMPERE) {
+        return MMVQ_PARAMETERS_TURING;
+    }
    return MMVQ_PARAMETERS_GENERIC;
 }

@@ -271,6 +277,53 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
    return MMVQ_MAX_BATCH_SIZE;
 }

+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) {
+    if (GGML_CUDA_CC_IS_CDNA(cc)) {
+        if (GGML_CUDA_CC_IS_CDNA1(cc)) {
+            switch (type) {
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                    return ne11 <= 7;
+                case GGML_TYPE_Q5_1:
+                    return ne11 <= 7;
+                case GGML_TYPE_Q8_0:
+                    return ne11 <= 6;
+                case GGML_TYPE_Q2_K:
+                    return ne11 <= 4;
+                case GGML_TYPE_Q3_K:
+                    return ne11 <= 3;
+                case GGML_TYPE_Q4_K:
+                    return ne11 <= 2;
+                case GGML_TYPE_Q5_K:
+                    return ne11 <= 3;
+                case GGML_TYPE_Q6_K:
+                    return ne11 <= 4;
+                case GGML_TYPE_IQ1_S:
+                    return ne11 <= 5;
+                case GGML_TYPE_IQ2_XXS:
+                case GGML_TYPE_IQ3_S:
+                case GGML_TYPE_IQ4_XS:
+                    return ne11 <= 6;
+                default:
+                    return ne11 <= MMVQ_MAX_BATCH_SIZE;
+            }
+        }
+        switch (type) { // tuned for CDNA2
+            case GGML_TYPE_Q2_K:
+                return ne11 <= 5;
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+                return ne11 <= 3;
+            case GGML_TYPE_Q6_K:
+                return ne11 <= 5;
+            default:
+                return ne11 <= MMVQ_MAX_BATCH_SIZE;
+        }
+    }
+    return ne11 <= MMVQ_MAX_BATCH_SIZE;
+}
+
 // Device constexpr: returns the max batch size for the current arch+type at compile time.
 template <ggml_type type>
 static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
@@ -370,11 +423,38 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
        }
        return 1;
    }
+    if (table_id == MMVQ_PARAMETERS_TURING) {
+        if (ncols_dst == 1) {
+            switch (type) {
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
+                    return 2;
+                default:
+                    return 4;
+            }
+        }
+        switch (ncols_dst) {
+            case 2:
+            case 3:
+            case 4:
+                return 4;
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                return 2;
+            default:
+                return 1;
+        }
+    }
    return 1;
 }

 static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id, bool small_k = false, int nwarps = 1) {
-    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
+    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN || table_id == MMVQ_PARAMETERS_TURING) {
        switch (ncols_dst) {
            case 1:
                return small_k ? nwarps : 1;
--- a/ggml/src/ggml-cuda/mmvq.cuh
+++ b/ggml/src/ggml-cuda/mmvq.cuh
@@ -2,6 +2,8 @@

 #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.

+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);
+
 // Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
 // based on the quantization type and GPU architecture (compute capability).
 int get_mmvq_mmid_max_batch(ggml_type type, int cc);
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -68,6 +68,7 @@ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C }
 static int opt_opstage  = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
 static int opt_opbatch  = 1024; // max number of ops in a batch
 static int opt_opqueue  = 16;   // max number of pending batches
+static int opt_oppoll   = 0;    // polling for batch completions

 static std::regex* opt_opfilter = NULL; // regex of ops to not claim

@@ -550,7 +551,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)

    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)

    // Ensure we don't try to read more data than is available in the source buffer 'data'
    // or write more than the tensor can hold.
@@ -611,7 +612,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)

    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)

    // Ensure we don't try to copy more data than the tensor actually contains.
    const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -660,6 +661,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
    ggml_aligned_free(buf_rp, row_size_rp);
 }

+static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
+    static const int qk = QK4_1;
+
+    for (unsigned int i = 0; i < qk / 2; ++i) {
+        const int x0             = (x->qs[i] & 0x0F);
+        const int x1             = (x->qs[i] >> 4);
+        qs[bi * qk + i + 0]      = x0;
+        qs[bi * qk + i + qk / 2] = x1;
+    }
+}
+
+static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
+    static const int qk = QK4_1;
+
+    for (unsigned int i = 0; i < qk / 2; ++i) {
+        const uint8_t x0 = qs[bi * qk + i + 0];
+        const uint8_t x1 = qs[bi * qk + i + qk / 2];
+        x->qs[i]         = x0 | (x1 << 4);
+    }
+}
+
+static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) {
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+    const int        nloe = k % qk;           // leftovers
+
+    const int dblk_size = 8 * 4;              // 8x (d, m) __fp16 = 32 bytes
+    const int qblk_size = qk / 2;             // int4 = 128 bytes
+    const int qrow_size = k / 2;              // int4 (not padded to blocks)
+
+    uint8_t * y_q = y + 0;                    // quants first
+    uint8_t * y_d = y + qrow_size;            // then scales/offsets
+
+    // Repack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q4_0x4x2];  // unpacked quants
+        unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
+        unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
+        unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
+        unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
+        unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
+        unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
+        unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
+        unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
+
+        bool partial = (nloe && i == nb-1);
+
+        uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk / 2; j++) {
+            q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
+        }
+    }
+
+    // Repack the scales and offsets
+    for (int i = 0; i < nb; i++) {
+        ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
+        for (int j = 0; j < 8; j++) {
+            d_m[j * 2 + 0] = x[i * 8 + j].d;
+            d_m[j * 2 + 1] = x[i * 8 + j].m;
+        }
+    }
+}
+
+static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) {
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+    const int        nloe = k % qk;           // leftovers
+
+    const int dblk_size = 8 * 4;              // 8x (d, m) __fp16 = 32 bytes
+    const int qblk_size = qk / 2;             // int4 = 128 bytes
+    const int qrow_size = k / 2;              // int4 (not padded to blocks)
+
+    const uint8_t * y_q = y + 0;              // quants first
+    const uint8_t * y_d = y + qrow_size;      // then scales/offsets
+
+    // Unpack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q4_0x4x2];
+        bool partial = (nloe && i == nb-1);
+
+        const uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk / 2; j++) {
+            if (partial) {
+                qs[j*2+0] = q[j] & 0x0F;
+                qs[j*2+1] = q[j] >> 4;
+            } else {
+                qs[j+000] = q[j] & 0x0F;
+                qs[j+128] = q[j] >> 4;
+            }
+        }
+
+        pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
+        pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
+        pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
+        pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
+        pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
+        pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
+        pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
+        pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
+    }
+
+    // Unpack the scales and offsets
+    for (int i = 0; i < nb; i++) {
+        const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size);
+        for (int j = 0; j < 8; j++) {
+            x[i * 8 + j].d = d_m[j * 2 + 0];
+            x[i * 8 + j].m = d_m[j * 2 + 1];
+        }
+    }
+}
+
+static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+
+    uint8_t qs[QK_Q4_0x4x2];  // unpacked quants
+    memset(qs, 0, sizeof(qs));
+
+    for (int i = 0; i < nb; i++) {
+        pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
+        pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
+        pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
+        pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
+        pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
+        pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
+        pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
+        pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
+    }
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < 8; j++) {
+            x[i * 8 + j].d = 0;
+            x[i * 8 + j].m = 0;
+        }
+    }
+}
+
+static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
+    int64_t nrows = ggml_nrows(t);
+
+    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
+    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
+
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
+    void * buf_pd = ggml_aligned_malloc(row_size_pd);
+    GGML_ASSERT(buf_pd != NULL);
+
+    void * buf_rp = ggml_aligned_malloc(row_size_rp);
+    GGML_ASSERT(buf_rp != NULL);
+
+    HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
+                t->ne[0], nrows, row_size);
+
+    init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
+
+    for (int64_t i = 0; i < n_full_rows; i++) {
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+
+        memcpy(buf_pd, src, row_size);
+        repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(dst, buf_rp, row_size);
+    }
+
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+
+        init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(buf_pd, src, n_rem_bytes);
+        repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+
+    ggml_aligned_free(buf_pd, row_size_pd);
+    ggml_aligned_free(buf_rp, row_size_rp);
+}
+
+static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
+    int64_t nrows = ggml_nrows(t);
+
+    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
+    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
+
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+
+    void * buf_pd = ggml_aligned_malloc(row_size_pd);
+    GGML_ASSERT(buf_pd != NULL);
+
+    void * buf_rp = ggml_aligned_malloc(row_size_rp);
+    GGML_ASSERT(buf_rp != NULL);
+
+    HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
+                t->ne[0], nrows, row_size);
+
+    memset(buf_rp, 0, row_size_rp);  // clear-out padded buffer to make sure the tail is all zeros
+
+    for (int64_t i = 0; i < n_full_rows; i++) {
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+
+        memcpy(buf_rp, src, row_size);
+        unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
+        memcpy(dst, buf_pd, row_size);
+    }
+
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_rp, src, row_size);
+        unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
+        memcpy(dst, buf_pd, n_rem_bytes);
+    }
+
+    ggml_aligned_free(buf_pd, row_size_pd);
+    ggml_aligned_free(buf_rp, row_size_rp);
+}
+
 // ======== Q8x4x2 ====================
 static void dump_block_q8_0(const block_q8_0 * b, int i) {
    HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
@@ -876,7 +1110,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)

    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size quants + scales)

    // Ensure we don't try to read more data than is available in the source buffer 'data'
    // or write more than the tensor can hold.
@@ -937,7 +1171,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)

    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size quants + scales)

    // Ensure we don't try to copy more data than the tensor actually contains.
    const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1238,7 +1472,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si

    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)

    // Ensure we don't try to read more data than is available in the source buffer 'data'
    // or write more than the tensor can hold.
@@ -1299,7 +1533,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si

    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)

    // Ensure we don't try to copy more data than the tensor actually contains.
    const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1365,6 +1599,12 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
            repack_q4_0_q4x4x2(tensor, data, size);
            break;

+        case GGML_TYPE_Q4_1:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q4_1_q4x4x2(tensor, data, size);
+            break;
+
        case GGML_TYPE_Q8_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@@ -1407,6 +1647,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
            repack_q4x4x2_q4_0(data, tensor, size);
            break;

+        case GGML_TYPE_Q4_1:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q4x4x2_q4_1(data, tensor, size);
+            break;
+
        case GGML_TYPE_Q8_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@@ -1886,7 +2132,8 @@ void ggml_hexagon_session::flush_pending(bool all) {
        uint32_t               n_dbufs;

        // Read response packet from queue
-        int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, DSPQUEUE_TIMEOUT);
+        const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
+        int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
        if (err == AEE_EEXPIRED) {
            continue;
        }
@@ -2290,6 +2537,7 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
    const int64_t H        = v->ne[1];
    const int64_t n_tokens = v->ne[2];
    const int64_t n_seqs   = v->ne[3];
+    const int64_t K        = state->ne[1];

    if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
        return false;
@@ -2302,10 +2550,10 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
    if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
        return false;
    }
-    if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
+    if (ggml_nelements(state) != S_v * S_v * H * n_seqs * K) {
        return false;
    }
-    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
+    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
        return false;
    }

@@ -2327,6 +2575,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_MXFP4:
@@ -2377,6 +2626,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_MXFP4:
@@ -2874,6 +3124,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
        case GGML_OP_NORM:            return HTP_OP_NORM;
        case GGML_OP_L2_NORM:         return HTP_OP_L2_NORM;
        case GGML_OP_RMS_NORM:        return HTP_OP_RMS_NORM;
+        case GGML_OP_CONCAT:          return HTP_OP_CONCAT;
        case GGML_OP_SCALE:           return HTP_OP_SCALE;
        case GGML_OP_SQR:             return HTP_OP_SQR;
        case GGML_OP_SQRT:            return HTP_OP_SQRT;
@@ -3286,6 +3537,25 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
    return true;
 }

+static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    int dim = ((const int32_t *) op->op_params)[0];
+    if (dim < 0 || dim >= GGML_MAX_DIMS) {
+        return false;
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        const struct ggml_tensor * src = op->src[i];
+        if (!src) {
+            continue;
+        }
+        if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
    const struct ggml_tensor * dst = op;

@@ -3434,6 +3704,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_cumsum(sess, op);
            break;

+        case GGML_OP_CONCAT:
+            supp = ggml_hexagon_supported_concat(sess, op);
+            break;
+
        case GGML_OP_FILL:
            supp = ggml_hexagon_supported_fill(sess, op);
            break;
@@ -3598,6 +3872,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
    // Basic sanity checks to make sure definitions match
    static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
                  "please update hexagon_type to match ggml_type");
+    static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
+                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
@@ -3610,6 +3886,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
    const char * str_opstage  = getenv("GGML_HEXAGON_OPSTAGE");
    const char * str_opbatch  = getenv("GGML_HEXAGON_OPBATCH");
    const char * str_opqueue  = getenv("GGML_HEXAGON_OPQUEUE");
+    const char * str_oppoll   = getenv("GGML_HEXAGON_OPPOLL");
    const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
    const char * str_profile  = getenv("GGML_HEXAGON_PROFILE");
    const char * str_etm      = getenv("GGML_HEXAGON_ETM");
@@ -3647,6 +3924,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
    opt_opstage   = str_opstage  ? strtoul(str_opstage, NULL, 0)          : opt_opstage;
    opt_opbatch   = str_opbatch  ? strtoul(str_opbatch, NULL, 0)          : opt_opbatch;
    opt_opqueue   = str_opqueue  ? strtoul(str_opqueue, NULL, 0)          : opt_opqueue;
+    opt_oppoll    = str_oppoll   ? strtoul(str_oppoll,  NULL, 0)          : opt_oppoll;
    opt_profile   = str_profile  ? atoi(str_profile)                      : 0;
    opt_etm       = str_etm      ? atoi(str_etm)                          : 0;
    opt_nhvx      = str_nhvx     ? strtoul(str_nhvx, NULL, 0)             : opt_nhvx;
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(${HTP_LIB} SHARED
    ssm-conv.c
    cumsum-ops.c
    fill-ops.c
+    concat-ops.c
    diag-ops.c
    solve-tri-ops.c
    gated-delta-net-ops.c
@@ -57,15 +58,16 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)

 if (_hmx_idx GREATER_EQUAL 0)
    target_sources(${HTP_LIB} PRIVATE
-        hmx-queue.c
-        hmx-matmul-ops.c
        hmx-flash-attn-ops.c
+        hmx-matmul-ops.c
+        hmx-queue.c
    )

    # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
    set_source_files_properties(
-        hmx-matmul-ops.c
        hmx-flash-attn-ops.c
+        hmx-matmul-ops.c
+        hmx-queue.c
        PROPERTIES COMPILE_OPTIONS "-mhmx"
    )

--- a/ggml/src/ggml-hexagon/htp/concat-ops.c
+++ b/ggml/src/ggml-hexagon/htp/concat-ops.c
@@ -0,0 +1,275 @@
+#include "htp-ctx.h"
+#include "htp-ops.h"
+#include "hexagon_types.h"
+#include "hexagon_protos.h"
+#include "hvx_hexagon_protos.h"
+#include "hex-dma.h"
+#include "vtcm-utils.h"
+#include "hvx-utils.h"
+#include "hex-fastdiv.h"
+#include <string.h>
+
+struct htp_concat_context {
+    struct htp_ops_context * octx;
+    uint32_t dim;
+    uint32_t nrows_per_thread;
+    struct fastdiv_values div_ne0;
+    struct fastdiv_values div_ne1;
+    struct fastdiv_values div_ne2;
+};
+
+static void concat_2d_f32_transposed(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_concat_context * cctx = (struct htp_concat_context *) data;
+    struct htp_ops_context * octx = cctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    const uint32_t src0_ne0 = src0->ne[0];
+    const uint32_t src1_ne0 = src1->ne[0];
+    const uint32_t ne1      = dst->ne[1];
+
+    const uint32_t start_i = ith * cctx->nrows_per_thread;
+    const uint32_t end_i   = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
+    if (start_i >= end_i) return;
+
+    dma_queue * q = octx->ctx->dma[ith];
+
+    uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
+    uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
+
+    const uint32_t block_i = 32;
+    const uint32_t spad1_stride = block_i * sizeof(float);
+
+    int32_t offsets[32] __attribute__((aligned(128)));
+    for(int k=0; k<32; k++) {
+        offsets[k] = k * spad1_stride;
+    }
+    HVX_Vector vv = *(HVX_Vector*)offsets;
+    const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 32);
+    const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(float), VLEN);
+    uint32_t mu = src1_ne0_padded * spad1_stride;
+
+    for (uint32_t i = start_i; i < end_i; i += block_i) {
+        uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
+
+        uint32_t src1_width_bytes = current_block_i * sizeof(float);
+        uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
+
+        uint32_t src0_row_bytes = src0_ne0 * sizeof(float);
+        uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
+
+        dma_queue_pop(q); // src1
+
+        HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
+
+        for (uint32_t j = 0; j < src1_ne0_padded; j += 32) {
+            #pragma unroll(4)
+            for (uint32_t ii = 0; ii < current_block_i; ii++) {
+                size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(float));
+                Q6_vgather_ARMVw(&vtcm_tmp[ii], rt, mu, vv);
+                uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(float);
+                hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
+            }
+        }
+
+        dma_queue_pop(q); // src0
+
+        uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
+        dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(float), current_block_i);
+
+        dma_queue_pop(q);
+    }
+}
+
+static void concat_2d_f16_transposed(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_concat_context * cctx = (struct htp_concat_context *) data;
+    struct htp_ops_context * octx = cctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    const uint32_t src0_ne0 = src0->ne[0];
+    const uint32_t src1_ne0 = src1->ne[0];
+    const uint32_t ne1      = dst->ne[1];
+
+    const uint32_t start_i = ith * cctx->nrows_per_thread;
+    const uint32_t end_i   = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
+    if (start_i >= end_i) return;
+
+    dma_queue * q = octx->ctx->dma[ith];
+
+    uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
+    uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
+
+    const uint32_t block_i = 64;
+    const uint32_t spad1_stride = block_i * sizeof(__fp16);
+
+    int16_t offsets[64] __attribute__((aligned(128)));
+    for(int k=0; k<64; k++) {
+        offsets[k] = k * spad1_stride;
+    }
+    HVX_Vector vv = *(HVX_Vector*)offsets;
+    const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 64);
+    const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(__fp16), VLEN);
+    uint32_t mu = src1_ne0_padded * spad1_stride;
+
+    for (uint32_t i = start_i; i < end_i; i += block_i) {
+        uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
+
+        uint32_t src1_width_bytes = current_block_i * sizeof(__fp16);
+        uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
+
+        uint32_t src0_row_bytes = src0_ne0 * sizeof(__fp16);
+        uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
+        dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
+
+        dma_queue_pop(q); // src1
+
+        HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
+
+        for (uint32_t j = 0; j < src1_ne0_padded; j += 64) {
+            #pragma unroll(4)
+            for (uint32_t ii = 0; ii < current_block_i; ii++) {
+                size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(__fp16));
+                Q6_vgather_ARMVh(&vtcm_tmp[ii], rt, mu, vv);
+                uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(__fp16);
+                hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
+            }
+        }
+
+        dma_queue_pop(q); // src0
+
+        uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
+        dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(__fp16), current_block_i);
+
+        dma_queue_pop(q);
+    }
+}
+
+static void concat_generic(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_concat_context * cctx = (struct htp_concat_context *) data;
+    struct htp_ops_context * octx = cctx->octx;
+
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    const int dim = cctx->dim;
+    const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
+
+    const uint32_t ne[4] = {dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]};
+    const uint32_t total_elements = ne[0] * ne[1] * ne[2] * ne[3];
+    const uint32_t chunk_size = (total_elements + nth - 1) / nth;
+
+    const uint32_t start_idx = MIN(ith * chunk_size, total_elements);
+    const uint32_t end_idx   = MIN(start_idx + chunk_size, total_elements);
+
+    // Naive scalar element-wise copy
+    for (uint32_t idx = start_idx; idx < end_idx; idx++) {
+        uint32_t idx_div_ne0 = fastdiv(idx, &cctx->div_ne0);
+        uint32_t i0 = idx - idx_div_ne0 * ne[0];
+
+        uint32_t idx_div_ne01 = fastdiv(idx_div_ne0, &cctx->div_ne1);
+        uint32_t i1 = idx_div_ne0 - idx_div_ne01 * ne[1];
+
+        uint32_t idx_div_ne012 = fastdiv(idx_div_ne01, &cctx->div_ne2);
+        uint32_t i2 = idx_div_ne01 - idx_div_ne012 * ne[2];
+        uint32_t i3 = idx_div_ne012;
+
+        uint8_t * dst_ptr = (uint8_t *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2] + i1 * dst->nb[1] + i0 * dst->nb[0];
+
+        uint32_t idx_dim = 0;
+        if (dim == 0) idx_dim = i0;
+        else if (dim == 1) idx_dim = i1;
+        else if (dim == 2) idx_dim = i2;
+        else if (dim == 3) idx_dim = i3;
+
+        const struct htp_tensor * src = (idx_dim < src0->ne[dim]) ? src0 : src1;
+
+        uint32_t s0 = i0;
+        uint32_t s1 = i1;
+        uint32_t s2 = i2;
+        uint32_t s3 = i3;
+
+        if (dim == 0 && src == src1) s0 -= src0->ne[0];
+        if (dim == 1 && src == src1) s1 -= src0->ne[1];
+        if (dim == 2 && src == src1) s2 -= src0->ne[2];
+        if (dim == 3 && src == src1) s3 -= src0->ne[3];
+
+        uint8_t * src_ptr = (uint8_t *)src->data + s3 * src->nb[3] + s2 * src->nb[2] + s1 * src->nb[1] + s0 * src->nb[0];
+
+        if (type_size == 4) {
+            *(float*)dst_ptr = *(float*)src_ptr;
+        } else {
+            *(__fp16*)dst_ptr = *(__fp16*)src_ptr;
+        }
+    }
+}
+
+int op_concat(struct htp_ops_context * octx) {
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * src1 = octx->src[1];
+    const struct htp_tensor * dst  = octx->dst;
+
+    int dim = octx->op_params[0];
+
+    bool is_2d = dst->ne[2] == 1 && dst->ne[3] == 1;
+
+    const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
+    bool is_src1_transposed  = (src1->nb[0] > src1->nb[1]);
+    bool is_src0_transposed  = (src0->nb[0] > src0->nb[1]);
+
+    uint32_t n_threads = octx->n_threads;
+    struct htp_concat_context cctx;
+    cctx.octx = octx;
+    cctx.dim = dim;
+    cctx.div_ne0 = init_fastdiv_values(dst->ne[0]);
+    cctx.div_ne1 = init_fastdiv_values(dst->ne[1]);
+    cctx.div_ne2 = init_fastdiv_values(dst->ne[2]);
+
+    void (*worker_func)(unsigned int, unsigned int, void *) = concat_generic;
+
+    if (dim == 0 && is_2d && is_src1_transposed && !is_src0_transposed) {
+        n_threads = MIN(dst->ne[1], n_threads);
+        if (n_threads < 1) {
+            n_threads = 1;
+        }
+        uint32_t block_i = (type_size == 4) ? 32 : 64;
+
+        cctx.nrows_per_thread = hmx_ceil_div(dst->ne[1], n_threads);
+
+        // Allocate VTCM
+        uint32_t spad1_stride = block_i * type_size;
+
+        uint32_t src1_ne0_padded = hex_round_up(src1->ne[0], block_i);
+        uint32_t spad0_row_bytes = hex_round_up((src0->ne[0] + src1_ne0_padded) * type_size, VLEN);
+
+        octx->src0_spad.size_per_thread = block_i * spad0_row_bytes;
+        octx->src1_spad.size_per_thread = src1_ne0_padded * spad1_stride + block_i * VLEN;
+
+        octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+        octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+
+        if (octx->src0_spad.size + octx->src1_spad.size > octx->ctx->vtcm_size) {
+            return HTP_STATUS_VTCM_TOO_SMALL;
+        }
+
+        octx->src0_spad.data = octx->ctx->vtcm_base;
+        octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+
+        if (type_size == 4) {
+            worker_func = concat_2d_f32_transposed;
+        } else {
+            worker_func = concat_2d_f16_transposed;
+        }
+    }
+
+    worker_pool_run_func(octx->ctx->worker_pool, worker_func, &cctx, n_threads);
+    return HTP_STATUS_OK;
+}
--- a/ggml/src/ggml-hexagon/htp/cpy-ops.c
+++ b/ggml/src/ggml-hexagon/htp/cpy-ops.c
@@ -28,158 +28,170 @@ struct htp_copy_context {
    uint32_t          dst_blocks_per_row;

    uint32_t          src0_nrows_per_thread;
-
-    void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith);
 };

 #define cpy_preamble                              \
    const struct htp_tensor *src0 = octx->src[0]; \
    const struct htp_tensor *dst  = octx->dst;    \
                                                  \
-    const uint32_t ne00 = src0->ne[0];     \
-    const uint32_t ne01 = src0->ne[1];     \
-    const uint32_t ne02 = src0->ne[2];     \
-    const uint32_t ne03 = src0->ne[3];     \
-                                           \
-    const uint32_t nb00 = src0->nb[0];     \
-    const uint32_t nb01 = src0->nb[1];     \
-    const uint32_t nb02 = src0->nb[2];     \
-    const uint32_t nb03 = src0->nb[3];     \
-                                           \
-    const uint32_t  ne0 = dst->ne[0];      \
-    const uint32_t  ne1 = dst->ne[1];      \
-    const uint32_t  ne2 = dst->ne[2];      \
-    const uint32_t  ne3 = dst->ne[3];      \
-                                           \
-    const uint32_t  nb0 = dst->nb[0];      \
-    const uint32_t  nb1 = dst->nb[1];      \
-    const uint32_t  nb2 = dst->nb[2];      \
-    const uint32_t  nb3 = dst->nb[3];      \
-                                           \
+    const uint32_t ne00 = src0->ne[0];            \
+    const uint32_t ne01 = src0->ne[1];            \
+    const uint32_t ne02 = src0->ne[2];            \
+    const uint32_t ne03 = src0->ne[3];            \
+                                                  \
+    const uint32_t nb00 = src0->nb[0];            \
+    const uint32_t nb01 = src0->nb[1];            \
+    const uint32_t nb02 = src0->nb[2];            \
+    const uint32_t nb03 = src0->nb[3];            \
+                                                  \
+    const uint32_t  ne0 = dst->ne[0];             \
+    const uint32_t  ne1 = dst->ne[1];             \
+    const uint32_t  ne2 = dst->ne[2];             \
+    const uint32_t  ne3 = dst->ne[3];             \
+                                                  \
+    const uint32_t  nb0 = dst->nb[0];             \
+    const uint32_t  nb1 = dst->nb[1];             \
+    const uint32_t  nb2 = dst->nb[2];             \
+    const uint32_t  nb3 = dst->nb[3];             \
+                                                  \
    const uint32_t   nr = ne01;

-static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
-    cpy_preamble;
-
-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // copy by rows
-    for (uint32_t i03 = 0; i03 < ne03; i03++) {
-        for (uint32_t i02 = 0; i02 < ne02; i02++) {
-            #pragma unroll(2)
-            for (uint32_t i01 = ir0; i01 < ir1; i01++) {
-                uint8_t* dst_ptr  = (uint8_t*) dst->data  + i01*nb1  + i02*nb2  + i03*nb3;
-                uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2);
-                hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size);
-            }
-        }
-    }
+#define DEFINE_CPY_SAMESHAPE(NAME, ELEM_TYPE, ELEM_SIZE)                                                       \
+static void cpy_thread_##NAME##_sameshape(unsigned int nth, unsigned int ith, void * data) {                   \
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;                                           \
+    struct htp_ops_context * octx = ct->octx;                                                                  \
+    cpy_preamble;                                                                                              \
+    const uint32_t dr  = ct->src0_nrows_per_thread;                                                            \
+    const uint32_t ir0 = dr * ith;                                                                             \
+    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;                                                    \
+    if (ir0 >= nr) return;                                                                                     \
+    for (uint32_t i03 = 0; i03 < ne03; i03++) {                                                                \
+        for (uint32_t i02 = 0; i02 < ne02; i02++) {                                                            \
+            _Pragma("unroll(4)")                                                                               \
+            for (uint32_t i01 = ir0; i01 < ir1; i01++) {                                                       \
+                uint8_t* dst_ptr  = (uint8_t*) dst->data  + i01*nb1  + i02*nb2  + i03*nb3;                     \
+                uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;                    \
+                hex_l2fetch(src0_ptr, ne00 * ELEM_SIZE, nb01, 2);                                              \
+                hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE);                                               \
+            }                                                                                                  \
+        }                                                                                                      \
+    }                                                                                                          \
 }

-static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) {
-    cpy_preamble;
+DEFINE_CPY_SAMESHAPE(f32, float, 4)
+DEFINE_CPY_SAMESHAPE(f16, __fp16, 2)

-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // Fast path: when both src0 and dst are contiguous in memory
-    // Replace the element-by-element loop with a single bulk HVX copy per (i03, i02) slice.
-    const bool src0_contig = (nb00 == ct->src0_type_size) &&
-                             (nb01 == ne00 * nb00) &&
-                             (nb02 == ne01 * nb01) &&
-                             (nb03 == ne02 * nb02);
-    const bool dst_contig  = (nb0  == ct->dst_type_size)  &&
-                             (nb1  == ne0  * nb0)  &&
-                             (nb2  == ne1  * nb1)  &&
-                             (nb3  == ne2  * nb2);
-
-    if (src0_contig && dst_contig) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01;
-                uint32_t  flat    = ((i03*ne02 + i02)*ne01 + ir0) * ne00;
-                uint8_t * dst_ptr = (uint8_t *) dst->data  + flat * ct->src0_type_size;
-                hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ct->src0_type_size);
-            }
-        }
-        return;
-    }
-
-    // dst counters
-    int64_t k10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    // number of blocks in a row
-    const int64_t nk00 = ct->src0_blocks_per_row;
-    const int64_t nk0  = ct->dst_blocks_per_row;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            k10 += nk00 * ir0;
-            while (k10 >= nk0) {
-                k10 -= nk0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-            for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t k00 = 0; k00 < nk00; k00++) {
-                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          char * dst_ptr  = ((char *)  dst->data + k10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-                    memcpy(dst_ptr, src0_ptr, ct->dst_type_size);
-
-                    if (++k10 == nk0) {
-                        k10 = 0;
-                        if (++i11 == ne1) {
-                            i11 = 0;
-                            if (++i12 == ne2) {
-                                i12 = 0;
-                                if (++i13 == ne3) {
-                                    i13 = 0;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            k10 += nk00 * (ne01 - ir1);
-            while (k10 >= nk0) {
-                k10 -= nk0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
+#define DEFINE_CPY_RESHAPE(NAME, ELEM_TYPE, ELEM_SIZE)                                                         \
+static void cpy_thread_##NAME##_reshape(unsigned int nth, unsigned int ith, void * data) {                     \
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;                                           \
+    struct htp_ops_context * octx = ct->octx;                                                                  \
+    cpy_preamble;                                                                                              \
+    const uint32_t dr  = ct->src0_nrows_per_thread;                                                            \
+    const uint32_t ir0 = dr * ith;                                                                             \
+    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;                                                    \
+    if (ir0 >= nr) return;                                                                                     \
+    const bool src0_contig = (nb00 == ELEM_SIZE)   &&                                                          \
+                             (nb01 == ne00 * nb00) &&                                                          \
+                             (nb02 == ne01 * nb01) &&                                                          \
+                             (nb03 == ne02 * nb02);                                                            \
+    const bool dst_contig  = (nb0  == ELEM_SIZE)   &&                                                          \
+                             (nb1  == ne0  * nb0)  &&                                                          \
+                             (nb2  == ne1  * nb1)  &&                                                          \
+                             (nb3  == ne2  * nb2);                                                             \
+    if (src0_contig && dst_contig) {                                                                           \
+        for (int64_t i03 = 0; i03 < ne03; i03++) {                                                             \
+            for (int64_t i02 = 0; i02 < ne02; i02++) {                                                         \
+                uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01;                   \
+                uint32_t  flat    = ((i03*ne02 + i02)*ne01 + ir0) * ne00;                                      \
+                uint8_t * dst_ptr = (uint8_t *) dst->data  + flat * ELEM_SIZE;                                 \
+                hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ELEM_SIZE);                                  \
+            }                                                                                                  \
+        }                                                                                                      \
+        return;                                                                                                \
+    }                                                                                                          \
+    const bool reshape_flat_fast = (ne03 == 1 && ne2 == 1 && ne3 == 1) &&                                      \
+                                   (ne0 == ne00 * ne01) && (ne1 == ne02) &&                                    \
+                                   (nb00 == ELEM_SIZE) && (nb0 == ELEM_SIZE);                                  \
+    if (reshape_flat_fast) {                                                                                   \
+        for (uint32_t i02 = 0; i02 < ne02; i02++) {                                                            \
+            for (uint32_t i01 = ir0; i01 < ir1; i01++) {                                                       \
+                uint8_t * src0_ptr = (uint8_t *) src0->data + i01 * nb01 + i02 * nb02;                         \
+                uint8_t * dst_ptr  = (uint8_t *) dst->data  + i01 * ne00 * ELEM_SIZE + i02 * nb1;              \
+                hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE);                                               \
+            }                                                                                                  \
+        }                                                                                                      \
+        return;                                                                                                \
+    }                                                                                                          \
+    int64_t k10 = 0;                                                                                           \
+    int64_t i11 = 0;                                                                                           \
+    int64_t i12 = 0;                                                                                           \
+    int64_t i13 = 0;                                                                                           \
+    const int64_t nk00 = ct->src0_blocks_per_row;                                                              \
+    const int64_t nk0  = ct->dst_blocks_per_row;                                                               \
+    for (int64_t i03 = 0; i03 < ne03; i03++) {                                                                 \
+        for (int64_t i02 = 0; i02 < ne02; i02++) {                                                             \
+            k10 += nk00 * ir0;                                                                                 \
+            while (k10 >= nk0) {                                                                               \
+                k10 -= nk0;                                                                                    \
+                if (++i11 == ne1) {                                                                            \
+                    i11 = 0;                                                                                   \
+                    if (++i12 == ne2) {                                                                        \
+                        i12 = 0;                                                                               \
+                        if (++i13 == ne3) {                                                                    \
+                            i13 = 0;                                                                           \
+                        }                                                                                      \
+                    }                                                                                          \
+                }                                                                                              \
+            }                                                                                                  \
+            for (int64_t i01 = ir0; i01 < ir1; i01++) {                                                        \
+                for (int64_t k00 = 0; k00 < nk00; k00++) {                                                     \
+                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); \
+                          char * dst_ptr  = ((char *)  dst->data + k10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);  \
+                    memcpy(dst_ptr, src0_ptr, ELEM_SIZE);                                                      \
+                    if (++k10 == nk0) {                                                                        \
+                        k10 = 0;                                                                               \
+                        if (++i11 == ne1) {                                                                    \
+                            i11 = 0;                                                                           \
+                            if (++i12 == ne2) {                                                                \
+                                i12 = 0;                                                                       \
+                                if (++i13 == ne3) {                                                            \
+                                    i13 = 0;                                                                   \
+                                }                                                                              \
+                            }                                                                                  \
+                        }                                                                                      \
+                    }                                                                                          \
+                }                                                                                              \
+            }                                                                                                  \
+            k10 += nk00 * (ne01 - ir1);                                                                        \
+            while (k10 >= nk0) {                                                                               \
+                k10 -= nk0;                                                                                    \
+                if (++i11 == ne1) {                                                                            \
+                    i11 = 0;                                                                                   \
+                    if (++i12 == ne2) {                                                                        \
+                        i12 = 0;                                                                               \
+                        if (++i13 == ne3) {                                                                    \
+                            i13 = 0;                                                                           \
+                        }                                                                                      \
+                    }                                                                                          \
+                }                                                                                              \
+            }                                                                                                  \
+        }                                                                                                      \
+    }                                                                                                          \
 }

-static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
+DEFINE_CPY_RESHAPE(f32, float, 4)
+DEFINE_CPY_RESHAPE(f16, __fp16, 2)
+
+static void cpy_thread_f16_f32_sameshape(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;
+    struct htp_ops_context * octx = ct->octx;
    cpy_preamble;

    // parallelize by src0 rows
    const uint32_t dr  = ct->src0_nrows_per_thread;
    const uint32_t ir0 = dr * ith;
    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
+    if (ir0 >= nr) return;

    // copy by rows
    for (uint32_t i03 = 0; i03 < ne03; i03++) {
@@ -195,13 +207,16 @@ static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct ht
    }
 }

-static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
+static void cpy_thread_f32_f16_sameshape(unsigned int nth, unsigned int ith, void * data) {
+    struct htp_copy_context * ct = (struct htp_copy_context *) data;
+    struct htp_ops_context * octx = ct->octx;
    cpy_preamble;

    // parallelize by src0 rows
    const uint32_t dr  = ct->src0_nrows_per_thread;
    const uint32_t ir0 = dr * ith;
    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
+    if (ir0 >= nr) return;

    // copy by rows
    for (uint32_t i03 = 0; i03 < ne03; i03++) {
@@ -217,11 +232,6 @@ static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct ht
    }
 }

-static void cpy_work_func(unsigned int n, unsigned int i, void *data) {
-    struct htp_copy_context *ct = (struct htp_copy_context *) data;
-    ct->copy(ct, ct->octx, n, i);
-}
-
 int op_cpy(struct htp_ops_context * octx) {
    cpy_preamble;

@@ -254,22 +264,32 @@ int op_cpy(struct htp_ops_context * octx) {

    ct.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;

+    worker_callback_t copy_fun;
+
    if (sametype && sameshape) {
-        ct.copy = cpy_thread_sametype_sameshape;
+        if (src0->type == HTP_TYPE_F32) {
+            copy_fun = cpy_thread_f32_sameshape;
+        } else {
+            copy_fun = cpy_thread_f16_sameshape;
+        }
    } else if (sameshape) {
        /**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32)
-            ct.copy = cpy_thread_f16_f32_sameshape;
+            copy_fun = cpy_thread_f16_f32_sameshape;
        else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16)
-            ct.copy = cpy_thread_f32_f16_sameshape;
+            copy_fun = cpy_thread_f32_f16_sameshape;
        else
            return HTP_STATUS_NO_SUPPORT;
    } else if (sametype) {
-        ct.copy = cpy_thread_sametype_reshape;
+        if (src0->type == HTP_TYPE_F32) {
+            copy_fun = cpy_thread_f32_reshape;
+        } else {
+            copy_fun = cpy_thread_f16_reshape;
+        }
    } else {
        return HTP_STATUS_NO_SUPPORT;
    }

-    worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_threads);
+    worker_pool_run_func(octx->ctx->worker_pool, copy_fun, &ct, n_threads);

    return HTP_STATUS_OK;
 }
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@@ -22,6 +22,16 @@
 // Must be multiple of 32
 #define FLASH_ATTN_BLOCK_SIZE (32 * 2)

+#if __HVX_ARCH__ < 79
+#define HVX_OP_ADD_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b))
+#define HVX_OP_SUB_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b))
+#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
+#else
+#define HVX_OP_ADD_F32(a, b) Q6_Vsf_vadd_VsfVsf(a, b)
+#define HVX_OP_SUB_F32(a, b) Q6_Vsf_vsub_VsfVsf(a, b)
+#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
+#endif
+
 // This is a bit of a hack because the compiler is strugling to properly inline
 // the default hvx_vec_f32_to_f16 with output into the local array.
 static __attribute__((noinline)) void hvx_vec_f32_to_f16_a(void *ptr, HVX_Vector v0, HVX_Vector v1)
@@ -54,8 +64,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
        rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x_hf, y_hf);
    }

-    HVX_Vector rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p)));
-    rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum)));
+    HVX_Vector rsum = HVX_OP_ADD_F32(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p));
+    rsum = HVX_OP_MUL_F32(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
    hvx_vec_store_u(r, 4, rsum);
 }

@@ -105,10 +115,10 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx4(const void * restrict y,
        rsum3_p = hvx_vec_mpyacc_f32_f16(rsum3_p, x3_hf, y_hf);
    }

-    HVX_Vector rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p)));
-    HVX_Vector rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p)));
-    HVX_Vector rsum2 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p)));
-    HVX_Vector rsum3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p)));
+    HVX_Vector rsum0 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p));
+    HVX_Vector rsum1 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p));
+    HVX_Vector rsum2 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p));
+    HVX_Vector rsum3 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p));

    HVX_Vector_x4 rsum0123 = { .v = { rsum0, rsum1, rsum2, rsum3 } };
    return hvx_vec_reduce_sum_f32x4(rsum0123);
@@ -123,7 +133,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
    const size_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
    const size_t nloe = n % VLEN_FP16; // leftover elements

-    HVX_Vector   sums;  // initialize at j = 0
+    HVX_Vector   sums = Q6_V_vzero();
    const size_t stride_x_4 = stride_x * 4;
    for (uint32_t j = 0; j < VLEN_FP32; j += 4) {
        HVX_Vector     sums_x4 = hvx_dot_f16_f16_aa_rx4(y, x, stride_x, nvec, nloe);
@@ -132,8 +142,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
        x += stride_x_4;
    }

-    sums = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), sums);
-    return Q6_Vsf_equals_Vqf32(sums);
+    return HVX_OP_MUL_F32(hvx_vec_splat_f32(s), sums);
 }

 // MAD: y (F32) += x (F16) * s (F16)
@@ -268,11 +277,10 @@ static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t *
    uint32_t i = 0;
    #pragma unroll(4)
    for (; i < nvec; ++i) {
-        vdst[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs));
+        vdst[i] = HVX_OP_MUL_F32(vsrc[i], vs);
    }
    if (nloe) {
-        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
-        hvx_vec_store_a(&vdst[i], nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v));
+        hvx_vec_store_a(&vdst[i], nloe * sizeof(float), HVX_OP_MUL_F32(vsrc[i], vs));
    }
 }

@@ -438,25 +446,44 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
            // Process in sub-blocks of 32 (VLEN_FP32)
            HVX_Vector sb_scores[FLASH_ATTN_BLOCK_SIZE / VLEN_FP32];
            HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
-            for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
+            for (uint32_t iv = 0; ic < current_block_size; ic += VLEN_FP32, ++iv) {
                // 1. Compute scores
                HVX_Vector scores = hvx_dot_f16_f16_aa_rx32(q_ptr_vtcm, k_base + ic * factx->size_k_row_padded, factx->size_k_row_padded, DK, factx->scale);

                // 2. Softcap
                if (factx->logit_softcap != 0.0f) {
                    scores = hvx_vec_tanh_f32(scores);
-                    scores = Q6_Vqf32_vmpy_VsfVsf(scores, logit_cap);
-                    scores = Q6_Vsf_equals_Vqf32(scores);
+                    scores = HVX_OP_MUL_F32(scores, logit_cap);
                }

                // 3. Mask
                if (mask) {
                    const __fp16 * mp = m_base + ic;
                    HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp;
-                    HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
-                    HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
-                    scores = Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores);
-                    scores = Q6_Vsf_equals_Vqf32(scores);
+
+                    // Multiplying -INFINITY (0xFC00) by a slope in VhfVhf instructions can incorrectly produce NaN on v79.
+                    // Clamp -INFINITY to the max negative fp16 finite value (-65504.0f).
+                    HVX_Vector vinf = Q6_Vh_vsplat_R(0xFC00);
+                    HVX_Vector vmin = Q6_Vh_vsplat_R(0xFBFF);
+                    HVX_VectorPred is_inf = Q6_Q_vcmp_eq_VhVh(m_vals_f16, vinf);
+                    m_vals_f16 = Q6_V_vmux_QVV(is_inf, vmin, m_vals_f16);
+
+                    #if __HVX_ARCH__ >= 79
+                        HVX_VectorPair m_vals_f32_pair = Q6_Wsf_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
+                        HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
+                        scores = Q6_Vsf_vadd_VsfVsf(add_val, scores);
+                    #else
+                        HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
+                        HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
+                        scores = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores));
+                    #endif
+                }
+
+                // Mask out invalid lanes for leftover handling
+                uint32_t valid_lanes = current_block_size - ic;
+                if (valid_lanes < VLEN_FP32) {
+                    HVX_VectorPred valid_pred = Q6_Q_vsetq_R(valid_lanes * 4); // 4 bytes per fp32 lane
+                    scores = Q6_V_vmux_QVV(valid_pred, scores, hvx_vec_splat_f32(-INFINITY));
                }

                sb_scores[iv] = scores;
@@ -466,78 +493,55 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
            {
                // 4. Online Softmax Update
                HVX_Vector M_new_vec = Q6_Vsf_vmax_VsfVsf(v_max, M_vec);
-                HVX_Vector diff_vec  = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(M_vec, M_new_vec));
+                HVX_Vector diff_vec  = HVX_OP_SUB_F32(M_vec, M_new_vec);
                HVX_Vector ms_vec    = hvx_vec_exp_f32(diff_vec);
                M_vec = M_new_vec;

                hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);

                HVX_Vector p_sum_vec = hvx_vec_splat_f32(0.0f);
-                for (uint32_t ic2 = 0, iv = 0; ic2 + VLEN_FP32 <= current_block_size; ic2 += VLEN_FP32, ++iv) {
+                for (uint32_t ic2 = 0, iv = 0; ic2 < current_block_size; ic2 += VLEN_FP32, ++iv) {
                    HVX_Vector scores = sb_scores[iv];
-                    HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_vec);
-                    HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted));
+                    HVX_Vector scores_shifted = HVX_OP_SUB_F32(scores, M_vec);
+                    HVX_Vector P = hvx_vec_exp_f32(scores_shifted);

-                    p_sum_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(p_sum_vec, P));
+                    p_sum_vec = HVX_OP_ADD_F32(p_sum_vec, P);

                    // 5. Accumulate V
                    __fp16 __attribute__((aligned(VLEN))) p_arr[VLEN_FP16];
                    hvx_vec_f32_to_f16_a(p_arr, P, hvx_vec_splat_f32(0));

+                    float __attribute__((aligned(128))) P_arr[VLEN_FP32];
+                    hvx_vec_store_a(P_arr, 128, P);
+
                    for (uint32_t j = 0; j < VLEN_FP32; j += 2) {
-                        const uint32_t  cur_ic = ic2 + j;
-                        const uint8_t * v_ptr  = v_base + cur_ic * factx->size_v_row_padded;
+                        const uint32_t cur_ic = ic2 + j;
+                        if (cur_ic >= current_block_size) {
+                            break;
+                        }
+
+                        if (cur_ic + 1 == current_block_size) {
+                            // Odd leftover, process single row
+                            if (P_arr[j] != 0.0f) {
+                                const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
+                                hvx_mad_f32_f16_aa(VKQ32, v_ptr, (p_arr + j), DV);
+                            }
+                            break;
+                        }
+
+                        // Avoid NaN * 0.0 = NaN for uninitialized V cache rows.
+                        // Check the f32 values to safely avoid strict aliasing violations.
+                        if (P_arr[j] == 0.0f && P_arr[j + 1] == 0.0f) {
+                            continue;
+                        }
+
+                        const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
                        hvx_mad_f32_f16_aa_rx2(VKQ32, v_ptr, v_ptr + factx->size_v_row_padded, (p_arr + j), (p_arr + j + 1), DV);
                    }
                }

                p_sum_vec = hvx_vec_reduce_sum_f32(p_sum_vec);
-                S_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(S_vec, ms_vec)), p_sum_vec));
-            }
-
-            if (ic < current_block_size) {
-                // Sync scalars for leftover/next block if needed
-                float M = hvx_vec_get_f32(M_vec);
-                float S = hvx_vec_get_f32(S_vec);
-
-                // Leftover
-                for (; ic < current_block_size; ++ic) {
-                    float s_val;
-                    const uint8_t * k_ptr = k_base + ic * factx->size_k_row_padded;
-                    hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, factx->scale);
-                    if (factx->logit_softcap != 0.0f) {
-                        s_val = factx->logit_softcap * tanhf(s_val);
-                    }
-
-                    if (mask) {
-                        const float m_val = m_base[ic];
-                        s_val += slope * m_val;
-                    }
-
-                    const float Mold = M;
-                    __fp16 vs = 1.0f;
-
-                    if (s_val > M) {
-                        M = s_val;
-                        HVX_Vector diff_vec = hvx_vec_splat_f32(Mold - M);
-                        HVX_Vector ms_vec   = hvx_vec_exp_f32(diff_vec);
-                        hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);
-
-                        float ms = hvx_vec_get_f32(ms_vec);
-                        S = S * ms + vs;
-                    } else {
-                        HVX_Vector diff_vec = hvx_vec_splat_f32(s_val - M);
-                        vs = hvx_vec_get_f32(hvx_vec_exp_f32(diff_vec));
-                        S += vs;
-                    }
-
-                    const uint8_t * v_ptr = v_base + ic * factx->size_v_row_padded;
-
-                    hvx_mad_f32_f16_aa(VKQ32, v_ptr, &vs, DV);
-                }
-
-                M_vec = hvx_vec_splat_f32(M);
-                S_vec = hvx_vec_splat_f32(S);
+                S_vec = HVX_OP_ADD_F32(HVX_OP_MUL_F32(S_vec, ms_vec), p_sum_vec);
            }

            // Issue DMA for next+1 block (if exists)
@@ -599,8 +603,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
        const int i2 = iq2;
        const int i3 = iq3;

-        // dst is permuted
-        uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1;
+        // dst is permuted: [DV, n_heads, n_tokens, n_seq]
+        // head stride is nb[1], token stride is nb[2], batch stride is nb[3]
+        uint8_t * dst_ptr = (uint8_t *) dst->data + i2 * dst->nb[1] + i1 * dst->nb[2] + i3 * dst->nb[3];

        if (dst->type == HTP_TYPE_F32) {
            hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
@@ -623,8 +628,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
    }

 #ifdef HTP_HAS_HMX
-    // HMX path: prefill (neq1 >= 32), head_dim multiple of 32, F16 KV
-    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0 && q->ne[1] >= 32) {
+    // HMX path: head_dim multiple of 32, F16 KV
+    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0) {
        int ret = hmx_flash_attn_ext(octx);
        if (ret == HTP_STATUS_OK) {
            return ret;
--- a/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
+++ b/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c
@@ -586,6 +586,7 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
    const uint32_t H        = v->ne[1];
    const uint32_t n_tokens = v->ne[2];
    const uint32_t n_seqs   = v->ne[3];
+    const uint32_t K        = state->ne[1];

    const uint32_t total_rows = H * n_seqs;
    if (ith >= total_rows) {
@@ -606,6 +607,10 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
    float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
    float local_sums[4] __attribute__((aligned(128)));

+    const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
+    const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
+    const int64_t shift = (int64_t) n_tokens - (int64_t) K;
+
    for (uint32_t ir = ith; ir < total_rows; ir += nth) {
        const uint32_t iv1 = ir % H;
        const uint32_t iv3 = ir / H;
@@ -615,8 +620,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
        const uint32_t iq3 = iv3 / rq3;
        const uint32_t ik3 = iv3 / rk3;

-        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
-        const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v;

        memcpy(s_out, s_in, gctx->state_bytes);
        float * s_work = s_out;
@@ -689,6 +694,16 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
                }
            }

+            if (K > 1) {
+                const int64_t target_slot = (int64_t) t - shift;
+                if (target_slot >= 0 && target_slot < (int64_t) K) {
+                    float * curr_state_o = state_out_base + (uint64_t) target_slot * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+                    if (curr_state_o != s_work) {
+                        memcpy(curr_state_o, s_work, gctx->state_bytes);
+                    }
+                }
+            }
+
            attn_data += (uint64_t) S_v * H;
        }
    }
@@ -709,6 +724,7 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
    const uint32_t S_v      = v->ne[0];
    const uint32_t H        = v->ne[1];
    const uint32_t n_seqs   = v->ne[3];
+    const uint32_t K        = state->ne[1];

    const uint32_t total_rows = H * n_seqs;
    if (ith >= total_rows) {
@@ -736,6 +752,9 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
        spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith;
    }

+    const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
+    const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
+
    for (uint32_t ir = ith; ir < total_rows; ir += nth) {
        const uint32_t iv1 = ir % H;
        const uint32_t iv3 = ir / H;
@@ -745,8 +764,8 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
        const uint32_t iq3 = iv3 / rq3;
        const uint32_t ik3 = iv3 / rk3;

-        float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
-        const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
+        const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v;
        float * s_work;

        if (spad) {
@@ -901,6 +920,7 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
    const uint32_t H        = v->ne[1];
    const uint32_t n_tokens = v->ne[2];
    const uint32_t n_seqs   = v->ne[3];
+    const uint32_t K        = state->ne[1];

    if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
        return HTP_STATUS_NO_SUPPORT;
@@ -913,10 +933,10 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
        (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
        return HTP_STATUS_NO_SUPPORT;
    }
-    if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
+    if (state->ne[0] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
        return HTP_STATUS_NO_SUPPORT;
    }
-    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
+    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
        return HTP_STATUS_NO_SUPPORT;
    }

--- a/ggml/src/ggml-hexagon/htp/get-rows-ops.c
+++ b/ggml/src/ggml-hexagon/htp/get-rows-ops.c
@@ -17,9 +17,13 @@

 struct get_rows_context {
    struct htp_ops_context * octx;
-    uint32_t src1_nrows_per_thread;
+    uint32_t tasks_per_thread;
+    uint32_t total_tasks;
+    uint32_t chunks_per_row;
+    uint32_t chunk_size;
    struct fastdiv_values get_rows_div_ne10;
    struct fastdiv_values get_rows_div_ne10_ne11;
+    struct fastdiv_values get_rows_div_chunks_per_row;
 };

 #define get_rows_preamble \
@@ -52,20 +56,23 @@ struct get_rows_context {
                                               \
    const uint32_t nr = ne10 * ne11 * ne12;

-static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
+static void get_rows_thread_f32_f32_dma(unsigned int nth, unsigned int ith, void *data) {
    struct get_rows_context * grctx = (struct get_rows_context *)data;
    struct htp_ops_context * octx = grctx->octx;
    get_rows_preamble;

    uint64_t qt = HAP_perf_get_qtimer_count();

-    // parallelize by src1 elements (which correspond to dst rows)
-    const uint32_t dr  = grctx->src1_nrows_per_thread;
+    const uint32_t dr  = grctx->tasks_per_thread;
    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
+    if (ir0 >= grctx->total_tasks) {
+        return;
+    }
+    const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);

    const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);

+    dma_queue * dma_queue = octx->ctx->dma[ith];
    for (uint32_t i = ir0; i < ir1; ++i) {
        const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11);
        const uint32_t rem = i - i12 * ne11 * ne10;
@@ -73,29 +80,77 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
        const uint32_t i10 = rem - i11 * ne10;

        const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
-
        uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;

        if (i01 >= ne01) {
-            // invalid index, skip for now to avoid crash
            continue;
        }

        const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03;
        const uintptr_t dst_ptr  = octx->dst->data    + i10*nb1  + i11*nb2  + i12*nb3;
-        hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
+
+        while (!dma_queue_push(dma_queue, dma_make_ptr((void *)dst_ptr, (const void *)src0_ptr), nb1, nb01, ne00 * sizeof(float), 1)) {
+            dma_queue_pop(dma_queue);
+        }
+    }
+    dma_queue_flush(dma_queue);
+
+    qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
+    FARF(HIGH, "get-rows-f32-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
+         ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
+}
+
+static void get_rows_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
+    struct get_rows_context * grctx = (struct get_rows_context *)data;
+    struct htp_ops_context * octx = grctx->octx;
+    get_rows_preamble;
+
+    uint64_t qt = HAP_perf_get_qtimer_count();
+
+    const uint32_t dr  = grctx->tasks_per_thread;
+    const uint32_t ir0 = dr * ith;
+    if (ir0 >= grctx->total_tasks) {
+        return;
+    }
+    const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);
+
+    const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
+
+    const uint32_t chunks_per_row = grctx->chunks_per_row;
+    const uint32_t chunk_size     = grctx->chunk_size;
+    for (uint32_t i = ir0; i < ir1; ++i) {
+        const uint32_t row_idx   = fastdiv(i, &grctx->get_rows_div_chunks_per_row);
+        const uint32_t chunk_idx = i - row_idx * chunks_per_row;
+
+        const uint32_t i12 = fastdiv(row_idx, &grctx->get_rows_div_ne10_ne11);
+        const uint32_t rem = row_idx - i12 * ne11 * ne10;
+        const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10);
+        const uint32_t i10 = rem - i11 * ne10;
+
+        const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
+        uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
+
+        if (i01 >= ne01) {
+            continue;
+        }
+
+        const uint32_t offset = chunk_idx * chunk_size;
+        if (offset < ne00) {
+            const uint32_t copy_size = MIN(chunk_size, ne00 - offset);
+            const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03 + offset * sizeof(float);
+            const uintptr_t dst_ptr  = octx->dst->data    + i10*nb1  + i11*nb2  + i12*nb3  + offset * sizeof(float);
+            hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, copy_size);
+        }
    }

    qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
-    FARF(HIGH, "get-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
+    FARF(HIGH, "get-rows-f32-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
         ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
 }

 int op_get_rows(struct htp_ops_context * octx) {
    get_rows_preamble;

-    const uint32_t n_threads = MIN(nr, octx->n_threads);
-
    if (octx->src[0]->type != HTP_TYPE_F32) {
        return HTP_STATUS_NO_SUPPORT;
    }
@@ -112,13 +167,52 @@ int op_get_rows(struct htp_ops_context * octx) {
        return HTP_STATUS_OK;
    }

+    const uint32_t nb00 = octx->src[0]->nb[0];
+    const uint32_t nb0  = octx->dst->nb[0];
+
+    const bool can_use_dma = (nb00 == sizeof(float)) && (nb0 == sizeof(float));
+    const bool use_dma = can_use_dma && (ne00 >= 2048);
+
    struct get_rows_context grctx;
    grctx.octx = octx;
    grctx.get_rows_div_ne10      = init_fastdiv_values(octx->src[1]->ne[0]);
    grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src[1]->ne[0] * octx->src[1]->ne[1]);

-    grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads;
+    if (use_dma) {
+        grctx.chunks_per_row = 1;
+        grctx.chunk_size = ne00;
+        grctx.total_tasks = nr;
+        grctx.get_rows_div_chunks_per_row = init_fastdiv_values(1);

-    worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_threads);
+        const uint32_t n_threads = MIN(nr, octx->n_threads);
+        grctx.tasks_per_thread = (nr + n_threads - 1) / n_threads;
+
+        worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_dma, &grctx, n_threads);
+    } else {
+        uint32_t chunks_per_row = 1;
+        uint32_t chunk_size = ne00;
+        uint32_t total_tasks = nr;
+
+        if (nr < octx->n_threads) {
+            const uint32_t min_chunk_size = 1024;
+            uint32_t max_chunks = ne00 / min_chunk_size;
+            if (max_chunks == 0) {
+                max_chunks = 1;
+            }
+            chunks_per_row = MIN((octx->n_threads + nr - 1) / nr, max_chunks);
+            chunk_size = (ne00 + chunks_per_row - 1) / chunks_per_row;
+            total_tasks = nr * chunks_per_row;
+        }
+
+        grctx.chunks_per_row = chunks_per_row;
+        grctx.chunk_size = chunk_size;
+        grctx.total_tasks = total_tasks;
+        grctx.get_rows_div_chunks_per_row = init_fastdiv_values(chunks_per_row);
+
+        const uint32_t n_threads = MIN(total_tasks, octx->n_threads);
+        grctx.tasks_per_thread = (total_tasks + n_threads - 1) / n_threads;
+
+        worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_hvx, &grctx, n_threads);
+    }
    return HTP_STATUS_OK;
 }
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@@ -50,8 +50,8 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
    const size_t g_br         = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
    const size_t q_tile_size  = hex_align_up(g_br * DK * sizeof(__fp16), 4096);    // Q:  [g_br, DK]
    const size_t o_tile_size  = hex_align_up(g_br * DV * sizeof(__fp16), 4096);    // O:  [g_br, DV] x2 ping-pong
-    const size_t k_dma_size   = hex_align_up(Bc * DK * sizeof(__fp16), 4096);      // K DMA: [Bc, DK] x2 double-buf
-    const size_t v_dma_size   = hex_align_up(Bc * DV * sizeof(__fp16), 4096);      // V DMA: [Bc, DV] x2 double-buf
+    const size_t k_dma_size   = hex_align_up(Bc * hex_round_up(DK * sizeof(__fp16), 128), 4096);      // K DMA: [Bc, DK] x2 double-buf
+    const size_t v_dma_size   = hex_align_up(Bc * hex_round_up(DV * sizeof(__fp16), 128), 4096);      // V DMA: [Bc, DV] x2 double-buf
    const size_t k_tile_size  = hex_align_up(Bc * DK * sizeof(__fp16), 4096);      // K tiles: [Bc, DK] interleaved
    const size_t v_tile_size  = hex_align_up(Bc * DV * sizeof(__fp16), 4096);      // V tiles: [Bc, DV] interleaved
    const size_t s_tile_size  = hex_align_up(g_br * Bc * sizeof(__fp16), 4096);    // S/P:[g_br, Bc]
@@ -1248,9 +1248,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    if (DK % 32 != 0 || DV % 32 != 0) {
        return HTP_STATUS_NO_SUPPORT;
    }
-    if (neq1 < 32) {
-        return HTP_STATUS_NO_SUPPORT;
-    }

    // GQA factor
    const uint32_t n_kv_heads = k->ne[2];
@@ -1278,7 +1275,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    struct hmx_fa_context factx;
    memset(&factx, 0, sizeof(factx));
    factx.octx           = octx;
-    factx.n_threads      = octx->ctx->n_threads;
+    factx.n_threads      = n_threads;
    factx.DK             = DK;
    factx.DV             = DV;
    factx.n_kv           = nek1;
@@ -1328,10 +1325,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    factx.m1          = powf(2.0f, -(max_bias / 2.0f) / factx.n_head_log2);

    // ======== VTCM allocation (GQA-aware) ========
+    const size_t size_k_row        = DK * sizeof(__fp16);
+    const size_t size_v_row        = DV * sizeof(__fp16);
+    const size_t size_k_row_padded = hex_round_up(size_k_row, 128);
+    const size_t size_v_row_padded = hex_round_up(size_v_row, 128);
+
    const size_t q_tile_bytes  = hex_align_up(g_br * DK * sizeof(__fp16), 4096);
    const size_t o_tile_bytes  = hex_align_up(g_br * DV * sizeof(__fp16), 4096);
-    const size_t k_dma_bytes   = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
-    const size_t v_dma_bytes   = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
+    const size_t k_dma_bytes   = hex_align_up(Bc * size_k_row_padded, 4096);
+    const size_t v_dma_bytes   = hex_align_up(Bc * size_v_row_padded, 4096);
    const size_t k_tile_bytes  = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
    const size_t v_tile_bytes  = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
    const size_t s_tile_bytes  = hex_align_up(g_br * Bc * sizeof(__fp16), 4096);
@@ -1401,11 +1403,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    // ======== DMA setup ========
    dma_queue * const dma = ctx->dma[0];

-    // Padded row sizes for DMA
-    const size_t size_k_row        = nek0 * sizeof(__fp16);
-    const size_t size_v_row        = nev0 * sizeof(__fp16);
-    const size_t size_k_row_padded = hex_round_up(nek0 * sizeof(__fp16), 128);
-    const size_t size_v_row_padded = hex_round_up(nev0 * sizeof(__fp16), 128);
+    // Padded row sizes for DMA (defined in outer scope)

    const size_t n_row_tiles_g_br = g_br / HMX_FP16_TILE_N_ROWS;
    const size_t n_tiles_per_bc   = Bc / HMX_FP16_TILE_N_COLS;
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -16,6 +16,7 @@
 #include "ggml-common.h"

 #include "hex-dma.h"
+#include "hex-fastdiv.h"
 #include "worker-pool.h"

 #include "hvx-utils.h"
@@ -34,6 +35,10 @@ static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
    -8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0,
 };

+static const __fp16 q4_1_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
+    0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0,
+};
+
 // MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value
 // kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6
 static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
@@ -62,6 +67,8 @@ static inline size_t get_x4x2_row_stride(int weight_type, int k) {
        case HTP_TYPE_Q4_0:
        case HTP_TYPE_IQ4_NL:
            return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE);         // 144 * nb
+        case HTP_TYPE_Q4_1:
+            return (size_t) nb * (QK_Q4_0x4x2 / 2 + 32);                         // 160 * nb
        case HTP_TYPE_Q8_0:
            return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE);             // 272 * nb
        case HTP_TYPE_MXFP4:
@@ -181,45 +188,44 @@ next_nc:
 // In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles
 // of the same 32 packed bytes.
 static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
    HVX_Vector vq = hvx_vmemu(packed_32);
    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
    HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
-    // q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
-    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
-    // Shuffle before LUT
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
-    // Use standard vlut16 (not _nomatch) to avoid stale-register NaN.
-    // _nomatch retains the previous destination-register value for colliding
-    // indices, but the C intrinsic doesn't model the implicit read so the
-    // compiler may allocate a register containing garbage/NaN.
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);
+    HVX_Vector v0     = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_int8));
+    HVX_Vector v_hf   = Q6_Vhf_equals_Vh(v0);

    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
 }

 // Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using
-// full HVX vector width.  One vmemu + one vlut16 replaces 4 separate calls.
+// full HVX vector width.
 // Output: vector_x2 each hold 32 FP16 values in the first 64 bytes.
 static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
            const uint8_t *packed_128, bool upper_nibbles,
            const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
-    // Load all 128 packed bytes (4 contiguous 32-byte groups)
+    (void)vlut_cvt;
    HVX_Vector vq = hvx_vmemu(packed_128);
    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
    v_quants = Q6_V_vand_VV(v_quants, mask_h4);

-    // Shuffle before LUT
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);

-    // Full-width vlut16: 128 byte lookups -> 128 fp16 results in a VectorPair
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_lo = Q6_V_lo_W(vp);  // [group0: 32 fp16 | group1: 32 fp16]
-    HVX_Vector v_hi = Q6_V_hi_W(vp);  // [group2: 32 fp16 | group3: 32 fp16]
+    HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_int8);
+    HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
+    HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
+
+    v_lo = Q6_Vhf_equals_Vh(v_lo);
+    v_hi = Q6_Vhf_equals_Vh(v_hi);

-    // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
    HVX_Vector vscale = hvx_vmemu(scales_4);
    HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
    HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
@@ -227,9 +233,97 @@ static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));

-    // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
-    HVX_Vector_x2 r = { v_lo,/* group1 already in [0:63] */
-                        v_hi /* group2 already in [0:63] */ };
+    HVX_Vector_x2 r = { v_lo, v_hi };
+    return r;
+}
+
+static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale_offset, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
+    HVX_Vector vq = hvx_vmemu(packed_32);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_dm = hvx_vmemu(scale_offset);
+    HVX_Vector v_scales = hvx_vec_repl_f16(v_dm);
+    HVX_Vector v_offsets = hvx_vec_repl_f16(Q6_V_vror_VR(v_dm, 2));
+
+    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+
+    HVX_Vector v0   = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_quants));
+    HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
+
+    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales), v_offsets));
+}
+
+static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx(
+            const uint8_t *packed_128, bool upper_nibbles,
+            const __fp16 *scales_offsets_4, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
+    HVX_Vector vq = hvx_vmemu(packed_128);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+
+    HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_quants);
+    HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
+    HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
+
+    v_lo = Q6_Vhf_equals_Vh(v_lo);
+    v_hi = Q6_Vhf_equals_Vh(v_hi);
+
+    HVX_Vector vscale_offset = hvx_vmemu(scales_offsets_4);
+    HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2);
+    HVX_Vector vd = Q6_V_lo_W(dm_deal);
+    HVX_Vector vm = Q6_V_hi_W(dm_deal);
+
+    HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vd);
+    HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vd, 4));
+
+    HVX_Vector v_os01 = hvx_vec_repl_2x_f16(vm);
+    HVX_Vector v_os23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vm, 4));
+
+    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01), v_os01));
+    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23), v_os23));
+
+    HVX_Vector_x2 r = { v_lo, v_hi };
+    return r;
+}
+
+// LUT-based dequantizers for non-linear IQ4_NL format.
+static inline HVX_Vector dequantize_x4x2_iq4_nl_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
+    HVX_Vector vq = hvx_vmemu(packed_32);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
+    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
+}
+
+static inline HVX_Vector_x2 dequantize_x4x2_iq4_nl_x4groups_hvx(
+            const uint8_t *packed_128, bool upper_nibbles,
+            const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
+    HVX_Vector vq = hvx_vmemu(packed_128);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
+
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector v_lo = Q6_V_lo_W(vp);
+    HVX_Vector v_hi = Q6_V_hi_W(vp);
+
+    HVX_Vector vscale = hvx_vmemu(scales_4);
+    HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
+    HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
+
+    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
+    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
+
+    HVX_Vector_x2 r = { v_lo, v_hi };
    return r;
 }

@@ -320,100 +414,176 @@ static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *
    return r;
 }

+typedef struct {
+    __fp16                  *dst;
+    const uint8_t           *src;
+    int                      n_cols;
+    int                      k_block;
+    size_t                   row_stride;
+    int                      weight_type;
+    int                      n_tot_tiles;
+    int                      n_tiles_per_task;
+    int                      n_tasks;
+    int                      n_k_tiles;
+    struct fastdiv_values    n_k_tiles_div;
+} x4x2_dequantize_state_t;
+
 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
 // Input:  vtcm_src has n_cols rows of x4x2 data, each row_stride bytes.
 // Output: vtcm_dst in tile-major FP16 layout.
-static void dequantize_x4x2_weight_to_fp16_tiles_task(
-        __fp16 *restrict vtcm_dst,
-        const uint8_t *restrict vtcm_src,
-        int n_cols, int k_block,
-        size_t row_stride, int weight_type,
+
+#define DEFINE_DEQUANTIZE_Q4_TASK(suffix, lut_name, helper_prefix, dblk_size, scale_step)                      \
+static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(                                                \
+        const x4x2_dequantize_state_t *state,                                                                  \
+        int start_tile, int end_tile) {                                                                        \
+                                                                                                               \
+    const int n_k_tiles = state->n_k_tiles;                                                                    \
+    const int qrow_size = (unsigned)state->k_block / 2;                                                        \
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;                                          \
+    const HVX_Vector vlut_cvt = hvx_vmem(lut_name);                                                            \
+                                                                                                               \
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);                                   \
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);                                                          \
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);                                                          \
+                                                                                                               \
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);                                               \
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);                                 \
+                                                                                                               \
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {                                                  \
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }                                                       \
+                                                                                                               \
+        if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) {        \
+            unsigned blk_idx      = ((kt * 32) / QK_Q4_0x4x2);                                                 \
+            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;                                            \
+            bool upper            = (sub_blk_base >= 4);                                                       \
+            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);                                               \
+            unsigned scale_off    = qrow_size + blk_idx * (dblk_size) + sub_blk_base * (scale_step);           \
+                                                                                                               \
+            __fp16 *tile_bases[4];                                                                             \
+            for (unsigned g = 0; g < 4; g++) {                                                                 \
+                tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS;                                   \
+            }                                                                                                  \
+                                                                                                               \
+            HVX_Vector v_off = v_scat_base;                                                                    \
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride;                               \
+                                                                                                               \
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {                                                \
+                const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride;                  \
+                const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride;                  \
+                                                                                                               \
+                HVX_Vector_x2 dv0 = dequantize_x4x2_##helper_prefix##_x4groups_hvx(                            \
+                    r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);                       \
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);         \
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);         \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+                                                                                                               \
+                HVX_Vector_x2 dv1 = dequantize_x4x2_##helper_prefix##_x4groups_hvx(                            \
+                    r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);                       \
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);         \
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);         \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+            }                                                                                                  \
+                                                                                                               \
+            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }                    \
+            t += 4; kt += 4;                                                                                   \
+            continue;                                                                                          \
+        }                                                                                                      \
+                                                                                                               \
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;                                             \
+        {                                                                                                      \
+            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;                                                      \
+            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;                                               \
+            bool upper         = (sub_blk >= 4);                                                               \
+            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;         \
+            unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk * (scale_step);                   \
+                                                                                                               \
+            HVX_Vector v_off = v_scat_base;                                                                    \
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride;                               \
+            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;                                                     \
+                                                                                                               \
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {                                     \
+                const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride;                  \
+                const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride;                  \
+                                                                                                               \
+                HVX_Vector v0 = dequantize_x4x2_##helper_prefix##_group_hvx(                                   \
+                    r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);                         \
+                HVX_Vector v1 = (row1 < (unsigned)state->n_cols)                                               \
+                    ? dequantize_x4x2_##helper_prefix##_group_hvx(                                             \
+                        r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)                      \
+                    : Q6_V_vzero();                                                                            \
+                                                                                                               \
+                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);            \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);            \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+            }                                                                                                  \
+            (void) *(volatile HVX_Vector *)(tile_base);                                                        \
+        }                                                                                                      \
+        ++t; ++kt;                                                                                             \
+    }                                                                                                          \
+                                                                                                               \
+    if (start_tile < end_tile) {                                                                               \
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);                   \
+    }                                                                                                          \
+}                                                                                                              \
+                                                                                                               \
+static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) {                 \
+    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;                                          \
+    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {                     \
+        int start = task_id * state->n_tiles_per_task;                                                         \
+        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);                             \
+        dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end);                                 \
+    }                                                                                                          \
+}
+
+DEFINE_DEQUANTIZE_Q4_TASK(q4_0,   q4_0_to_fp16_lut,   q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))
+DEFINE_DEQUANTIZE_Q4_TASK(q4_1,   q4_1_to_fp16_lut,   q4_1, 32, 4)
+DEFINE_DEQUANTIZE_Q4_TASK(iq4_nl, iq4_nl_to_fp16_lut, iq4_nl, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))
+
+static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(
+        const x4x2_dequantize_state_t *state,
        int start_tile, int end_tile) {

-    const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
-    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
-    const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
+    const int n_k_tiles = state->n_k_tiles;
+    const int qrow_size = state->k_block;
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
+    const HVX_Vector vlut_cvt = hvx_vmem(mxfp4_to_fp16_lut);

-    const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
-                                (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut) :
-                                                                   hvx_vmem(q4_0_to_fp16_lut);
-
-    // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
-    // Each int32 element holds a K-row-pair (2 adjacent fp16 values).  word[i] at offset i*128
-    // maps to K-rows 2i and 2i+1.  Column offset (n*4) added per row.
    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
-    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);  // 4 bytes = 1 column step
-    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);  // first 16 words (64 bytes)
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);

-    unsigned ct = (unsigned)start_tile / n_k_tiles;  // column tile index
-    unsigned kt = (unsigned)start_tile % n_k_tiles;  // K tile index
-    for (unsigned t = start_tile; t < end_tile; ) {
-        if (kt >= n_k_tiles) { kt = 0; ct++; }
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);

-        // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
-        if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
-            unsigned blk_idx      = (kt * 32) / QK_Q4_0x4x2;
-            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
-            bool upper            = (sub_blk_base >= 4);
-            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
-            unsigned scale_off    = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
-                                  + sub_blk_base * (int)sizeof(__fp16);   // 4 consecutive scales
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }

-            __fp16 *tile_bases[4];
-            for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
-
-            HVX_Vector v_off = v_scat_base;
-
-            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
-            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
-
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                HVX_Vector_x2 dv0 = dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                HVX_Vector_x2 dv1 = dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
-
-                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
-                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-
-                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
-                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-            }
-
-            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
-            t += 4; kt += 4;
-            continue;
-        }
-
-        // --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales ---
-        if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
+        // Batch-4 fast path for MXFP4
+        if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) {
            int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
-            int  sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;                 // 0 or 4
+            int  sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;
            bool upper        = (sub_blk_base >= 4);
-            int  packed_off   = blk_idx * (QK_MXFP4x4x2 / 2);                    // 128 contiguous packed bytes
-            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;  // all 8 E8M0 scales
+            int  packed_off   = blk_idx * (QK_MXFP4x4x2 / 2);
+            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;

            __fp16 * tile_bases[4];
            for (int g = 0; g < 4; g++) {
-                tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS;
+                tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS;
            }

            HVX_Vector v_off = v_scat_base;
            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                int             row0 = ct * HMX_FP16_TILE_N_COLS + r;
                int             row1 = row0 + 1;
-                const uint8_t * r0   = vtcm_src + row0 * row_stride;
-                const uint8_t * r1   = vtcm_src + row1 * row_stride;
+                const uint8_t * r0   = state->src + row0 * state->row_stride;
+                const uint8_t * r1   = state->src + row1 * state->row_stride;

-                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);

                HVX_Vector_x4 dv0, dv1;
                dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8);
-                if (row1 < n_cols) {
+                if (row1 < state->n_cols) {
                    mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
                    dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8);
                } else {
@@ -434,41 +604,13 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                (void) *(volatile HVX_Vector *) (tile_bases[g]);
            }

-            t += 4;
+            t += 4; kt += 4;
            continue;
        }

-        // --- Single-tile fallback ---
-        __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
-
-        if (is_q4) {
-            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;
-            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;
-            bool upper         = (sub_blk >= 4);
-            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
-            unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
-
-            HVX_Vector v_off = v_scat_base;  // reset to column 0
-            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
-            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
-                    r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                HVX_Vector v1 = (row1 < n_cols)
-                    ? dequantize_x4x2_q4_0_group_hvx(
-                        r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
-                    : Q6_V_vzero();
-
-                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
-                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-            }
-            (void) *(volatile HVX_Vector *)(tile_base);
-        } else if (weight_type == HTP_TYPE_MXFP4) {
+        // Single-tile fallback
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
+        {
            int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
            int  sub_blk      = ((kt * 32) % QK_MXFP4x4x2) / 32;
            bool upper        = (sub_blk >= 4);
@@ -480,15 +622,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                int row1 = row0 + 1;

-                const uint8_t * r0 = vtcm_src + row0 * row_stride;
-                const uint8_t * r1 = vtcm_src + row1 * row_stride;
+                const uint8_t * r0 = state->src + row0 * state->row_stride;
+                const uint8_t * r1 = state->src + row1 * state->row_stride;

-                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);

                HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8);
                HVX_Vector v1;
-                if (row1 < n_cols) {
+                if (row1 < state->n_cols) {
                    mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
                    v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8);
                } else {
@@ -501,23 +642,59 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
            }
            (void) *(volatile HVX_Vector *) (tile_base);
-        } else {
-            // Q8_0
+        }
+        ++t; ++kt;
+    }
+
+    if (start_tile < end_tile) {
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
+    }
+}
+
+static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) {
+    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
+        int start = task_id * state->n_tiles_per_task;
+        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
+        dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end);
+    }
+}
+
+static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(
+        const x4x2_dequantize_state_t *state,
+        int start_tile, int end_tile) {
+
+    const int n_k_tiles = state->n_k_tiles;
+    const int qrow_size = state->k_block;
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
+
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
+
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);
+
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }
+
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
+        {
            int blk_idx  = (kt * 32) / QK_Q8_0x4x2;
            int sub_blk  = ((kt * 32) % QK_Q8_0x4x2) / 32;
            int byte_off  = blk_idx * QK_Q8_0x4x2 + sub_blk * 32;
            int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);

-            HVX_Vector v_off = v_scat_base;  // reset to column 0
+            HVX_Vector v_off = v_scat_base;
            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                int row1 = row0 + 1;

-                const uint8_t *r0 = vtcm_src + row0 * row_stride;
-                const uint8_t *r1 = vtcm_src + row1 * row_stride;
+                const uint8_t *r0 = state->src + row0 * state->row_stride;
+                const uint8_t *r1 = state->src + row1 * state->row_stride;

                HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
-                HVX_Vector v1 = (row1 < n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
+                HVX_Vector v1 = (row1 < state->n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();

                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
@@ -529,50 +706,31 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
        ++t; ++kt;
    }

-    // Drain HVX scatter write buffer: a vmem load on the same HW thread retires
-    // all pending scatter entries to VTCM.  Without this, the main thread's HMX
-    // reads may see stale data because atomic_fetch_sub (release) only orders
-    // regular stores, not the HVX scatter buffer.
    if (start_tile < end_tile) {
-        (void) *(volatile HVX_Vector *)(vtcm_dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
    }
 }

-typedef struct {
-    __fp16        *dst;
-    const uint8_t *src;
-    int            n_cols;
-    int            k_block;
-    size_t         row_stride;
-    int            weight_type;
-    int            n_tot_tiles;
-    int            n_tiles_per_task;
-    int            n_tasks;
-} x4x2_dequantize_state_t;
-
-static void dequantize_x4x2_worker_loop(unsigned int n, unsigned int i, void *data) {
+static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) {
    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
-
    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
        int start = task_id * state->n_tiles_per_task;
        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
-
-        dequantize_x4x2_weight_to_fp16_tiles_task(
-            state->dst, state->src, state->n_cols, state->k_block,
-            state->row_stride, state->weight_type, start, end);
+        dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end);
    }
 }

 static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
        struct htp_context *ctx, __fp16 *vtcm_dst,
        const void *vtcm_src, int n_cols, int k_block,
-        size_t row_stride, int weight_type) {
+        size_t row_stride, int weight_type,
+        int n_k_tiles, struct fastdiv_values n_k_tiles_div,
+        worker_callback_t dequant_worker_fn) {

    assert(n_cols  % HMX_FP16_TILE_N_COLS == 0);
    assert(k_block % HMX_FP16_TILE_N_COLS == 0);

    size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
-    size_t n_k_tiles   = k_block / HMX_FP16_TILE_N_COLS;
    size_t n_tot_tiles = n_col_tiles * n_k_tiles;

    size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads);
@@ -587,12 +745,16 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
    state.k_block          = k_block;
    state.row_stride       = row_stride;
    state.weight_type      = weight_type;
+    state.n_k_tiles        = n_k_tiles;
+    state.n_k_tiles_div    = n_k_tiles_div;

-    worker_pool_run_func(ctx->worker_pool, dequantize_x4x2_worker_loop, &state, ctx->n_threads);
+    worker_pool_run_func(ctx->worker_pool, dequant_worker_fn, &state, ctx->n_threads);
 }

 // --- End x4x2 dequantizers ---

+#pragma clang diagnostic ignored "-Wbackend-plugin" // spurios warning for hmx intrinsics
+
 // requires external HMX lock
 static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
                                int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
@@ -883,6 +1045,20 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
        return -1;
    }

+    worker_callback_t dequant_worker_fn = NULL;
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:   dequant_worker_fn = dequantize_x4x2_worker_loop_q4_0; break;
+        case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_x4x2_worker_loop_iq4_nl; break;
+        case HTP_TYPE_Q4_1:   dequant_worker_fn = dequantize_x4x2_worker_loop_q4_1; break;
+        case HTP_TYPE_MXFP4:  dequant_worker_fn = dequantize_x4x2_worker_loop_mxfp4; break;
+        case HTP_TYPE_Q8_0:   dequant_worker_fn = dequantize_x4x2_worker_loop_q8_0; break;
+        default:
+            return -1;
+    }
+
+    const int n_k_tiles = k / HMX_FP16_TILE_N_COLS;
+    const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles);
+
    // --- Dynamic VTCM layout ---
    const size_t vec_dot_size = k * sizeof(__fp16);
    const size_t vtcm_budget  = ctx->vtcm_size;
@@ -975,7 +1151,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
        {
            // B0: wait for DMA, dequant weight chunk 0
            dma_queue_pop(ctx->dma[0]);
-            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
+            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);

            // A1: issue DMA for weight chunk 1
            const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
@@ -994,7 +1170,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
            // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
            if (1 < n_chunk_cnt) {
                dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
            }
        }

@@ -1036,7 +1212,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
            // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
            if (i + 2 < n_chunk_cnt) {
                dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
            }
        }
    }
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -104,6 +104,7 @@ int op_argsort(struct htp_ops_context * octx);
 int op_ssm_conv(struct htp_ops_context * octx);
 int op_cumsum(struct htp_ops_context * octx);
 int op_fill(struct htp_ops_context * octx);
+int op_concat(struct htp_ops_context * octx);
 int op_diag(struct htp_ops_context * octx);
 int op_solve_tri(struct htp_ops_context * octx);
 int op_gated_delta_net(struct htp_ops_context * octx);
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -20,6 +20,7 @@ enum htp_data_type {
    HTP_TYPE_F32    = 0,
    HTP_TYPE_F16    = 1,
    HTP_TYPE_Q4_0   = 2,
+    HTP_TYPE_Q4_1   = 3,
    HTP_TYPE_Q8_0   = 8,
    HTP_TYPE_IQ4_NL = 20,
    HTP_TYPE_I32    = 26,
@@ -28,6 +29,7 @@ enum htp_data_type {

    // types used internally for repack, dyn.quant, etc
    HTP_TYPE_Q4_0x4x2 = 200,
+    HTP_TYPE_Q4_1x4x2,
    HTP_TYPE_Q8_0x4x2,
    HTP_TYPE_MXFP4x4x2,

@@ -89,6 +91,7 @@ enum htp_op_code {
    HTP_OP_TRI,
    HTP_OP_PAD,
    HTP_OP_NORM,
+    HTP_OP_CONCAT,

    HTP_OP_INVALID
 };
--- a/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
@@ -0,0 +1,90 @@
+#ifndef HVX_SIN_COS_H
+#define HVX_SIN_COS_H
+
+#include "hvx-base.h"
+#include "hvx-floor.h"
+
+static inline HVX_Vector hvx_vec_cos_f32(HVX_Vector x) {
+    HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
+    HVX_Vector const_half   = hvx_vec_splat_f32(0.5f);
+    HVX_Vector const_pi     = hvx_vec_splat_f32(3.141592653589793f);
+    HVX_Vector const_one    = hvx_vec_splat_f32(1.0f);
+    HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
+
+    // n = floor(x * (1/pi) + 0.5)
+    HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
+
+    // y = x - n * pi
+    HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
+
+    // Sign determination: if n is odd, sign is -1.0f, else 1.0f
+    // half_n = n * 0.5f
+    HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
+    // floor_half_n = floor(half_n)
+    HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
+    // is_odd = half_n > floor_half_n
+    HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
+    // sign = vmux(is_odd, -1.0f, 1.0f)
+    HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
+
+    // z = y^2
+    HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
+
+    // Chebyshev approximation for cos(y)
+    HVX_Vector c4 = hvx_vec_splat_f32(2.3557242013849433e-05f);
+    HVX_Vector c3 = hvx_vec_splat_f32(-0.0013871428263450528f);
+    HVX_Vector c2 = hvx_vec_splat_f32(0.041665895266688284f);
+    HVX_Vector c1 = hvx_vec_splat_f32(-0.4999999360426369f);
+    HVX_Vector c0 = hvx_vec_splat_f32(0.9999999999071725f);
+
+    HVX_Vector cos_y = hvx_vec_add_f32_f32(c3, hvx_vec_mul_f32_f32(z, c4));
+    cos_y = hvx_vec_add_f32_f32(c2, hvx_vec_mul_f32_f32(z, cos_y));
+    cos_y = hvx_vec_add_f32_f32(c1, hvx_vec_mul_f32_f32(z, cos_y));
+    cos_y = hvx_vec_add_f32_f32(c0, hvx_vec_mul_f32_f32(z, cos_y));
+
+    return hvx_vec_mul_f32_f32(cos_y, sign);
+}
+
+static inline HVX_Vector hvx_vec_sin_f32(HVX_Vector x) {
+    HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
+    HVX_Vector const_half   = hvx_vec_splat_f32(0.5f);
+    HVX_Vector const_pi     = hvx_vec_splat_f32(3.141592653589793f);
+    HVX_Vector const_one    = hvx_vec_splat_f32(1.0f);
+    HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
+
+    // n = floor(x * (1/pi) + 0.5)
+    HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
+
+    // y = x - n * pi
+    HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
+
+    // Sign determination: if n is odd, sign is -1.0f, else 1.0f
+    // half_n = n * 0.5f
+    HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
+    // floor_half_n = floor(half_n)
+    HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
+    // is_odd = half_n > floor_half_n
+    HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
+    // sign = vmux(is_odd, -1.0f, 1.0f)
+    HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
+
+    // z = y^2
+    HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
+
+    // Chebyshev approximation for sin(y)
+    HVX_Vector s4 = hvx_vec_splat_f32(2.642186986152672e-06f);
+    HVX_Vector s3 = hvx_vec_splat_f32(-0.00019825318964070864f);
+    HVX_Vector s2 = hvx_vec_splat_f32(0.00833326283319605f);
+    HVX_Vector s1 = hvx_vec_splat_f32(-0.16666666082087775f);
+    HVX_Vector s0 = hvx_vec_splat_f32(0.999999999915155f);
+
+    HVX_Vector sin_y = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(z, s4));
+    sin_y = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(z, sin_y));
+    sin_y = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(z, sin_y));
+    sin_y = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(z, sin_y));
+    sin_y = hvx_vec_mul_f32_f32(y, sin_y);
+
+    return hvx_vec_mul_f32_f32(sin_y, sign);
+}
+
+#endif /* HVX_SIN_COS_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -14,6 +14,8 @@
 #include "hvx-sqrt.h"
 #include "hvx-arith.h"
 #include "hvx-div.h"
+#include "hvx-floor.h"
+#include "hvx-sin-cos.h"
 #include "hvx-base.h"

 #endif /* HVX_UTILS_H */
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -420,8 +420,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que

    ctx->n_threads = n_hvx;
    for (int i = 0; i < ctx->n_threads; i++) {
-        // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
-        ctx->dma[i] = dma_queue_create(128);
+        ctx->dma[i] = dma_queue_create(256); // queue depth
    }

    // init worker pool
@@ -601,6 +600,9 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_PAD:
            return op_pad(octx);

+        case HTP_OP_CONCAT:
+            return op_concat(octx);
+
        case HTP_OP_GATED_DELTA_NET:
            return op_gated_delta_net(octx);

@@ -851,6 +853,11 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
        for (uint32_t i=0; i < n_ops; i++) {
            struct profile_data prof;

+            if (i == (n_ops-1)) {
+                // wake up the host before starting the last op
+                dspqueue_write_early_wakeup_noblock(queue, 0, 0);
+            }
+
            profile_start(ctx->profiler, &prof);

            proc_op_req(octx, tens, i, &ops[i]);
@@ -867,8 +874,6 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
            }
        }

-        // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
-
        struct htp_opbatch_rsp rsp;
        rsp.id        = req.id;
        rsp.status    = HTP_STATUS_OK;
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
--- a/ggml/src/ggml-hexagon/htp/rope-ops.c
+++ b/ggml/src/ggml-hexagon/htp/rope-ops.c
@@ -7,6 +7,7 @@

 #include <math.h>
 #include <string.h>
+#include <stdlib.h>

 #include "hex-dma.h"
 #include "hvx-utils.h"
@@ -75,6 +76,9 @@ struct htp_rope_context {
    size_t theta_cache_offset;
    uint32_t src0_nrows;

+    struct fastdiv_values div_ne2_ne1;
+    struct fastdiv_values div_ne1;
+
    uint64_t t_start;
 };

@@ -117,13 +121,84 @@ static __attribute__((noinline)) void rope_cache_init(const float    theta_base,
                            float *        cache,
                            const float    theta_scale) {
    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
-    float theta = theta_base;
+#if __HVX_ARCH__ >= 79
+    const bool is_v79_or_newer = true;
+#else
+    const bool is_v79_or_newer = false;
+#endif

-    for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
-        const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
-        rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
+    if (is_v79_or_newer && ext_factor == 0.0f) {
+        // Fast path: fully vectorized
+        // We process 32 pairs (64 elements) per iteration.
+        const uint32_t n_blocks = ne0 / 64;

-        theta *= theta_scale;
+        // Initialize theta scale powers: [1.0f, theta_scale, theta_scale^2, ..., theta_scale^31]
+        float __attribute__((aligned(128))) theta_powers[32];
+        theta_powers[0] = 1.0f;
+        for (int j = 1; j < 32; j++) {
+            theta_powers[j] = theta_powers[j - 1] * theta_scale;
+        }
+        HVX_Vector v_theta_powers = hvx_vmem(theta_powers);
+
+        HVX_Vector v_freq_scale = hvx_vec_splat_f32(freq_scale);
+        HVX_Vector v_mscale = hvx_vec_splat_f32(mscale);
+
+        // Base theta starts at theta_base
+        float theta_block = theta_base;
+        // The scale factor for the next block is theta_scale^32
+        float theta_scale_32 = 1.0f;
+        for (int j = 0; j < 32; j++) {
+            theta_scale_32 *= theta_scale;
+        }
+
+        for (uint32_t b = 0; b < n_blocks; b++) {
+            uint32_t i0 = b * 64;
+            HVX_Vector v_theta_base = hvx_vec_splat_f32(theta_block);
+            HVX_Vector v_theta = hvx_vec_mul_f32_f32(v_theta_base, v_theta_powers);
+
+            if (freq_factors) {
+                // Load 32 elements of freq_factors
+                HVX_Vector v_ff = hvx_vmemu(freq_factors + i0 / 2);
+                HVX_Vector v_inv_ff = hvx_vec_inverse_f32(v_ff);
+                v_theta = hvx_vec_mul_f32_f32(v_theta, v_inv_ff);
+            }
+
+            HVX_Vector v_theta_final = hvx_vec_mul_f32_f32(v_theta, v_freq_scale);
+
+            HVX_Vector vcos = hvx_vec_cos_f32(v_theta_final);
+            HVX_Vector vsin = hvx_vec_sin_f32(v_theta_final);
+
+            vcos = hvx_vec_mul_f32_f32(vcos, v_mscale);
+            vsin = hvx_vec_mul_f32_f32(vsin, v_mscale);
+
+            HVX_VectorPair vstore = Q6_W_vshuff_VVR(vsin, vcos, -4);
+
+            if (((uintptr_t)cache) % 128 == 0) {
+                hvx_vmem(cache + i0 + 0)  = Q6_V_lo_W(vstore);
+                hvx_vmem(cache + i0 + 32) = Q6_V_hi_W(vstore);
+            } else {
+                hvx_vec_store_u(cache + i0 + 0,  32 * sizeof(float), Q6_V_lo_W(vstore));
+                hvx_vec_store_u(cache + i0 + 32, 32 * sizeof(float), Q6_V_hi_W(vstore));
+            }
+
+            theta_block *= theta_scale_32;
+        }
+
+        // Leftovers
+        float theta = theta_block;
+        for (uint32_t i0 = n_blocks * 64; i0 < ne0; i0 += 2) {
+            const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
+            rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
+            theta *= theta_scale;
+        }
+    } else {
+        // Fallback to original scalar loop
+        float theta = theta_base;
+        for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
+            const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
+            rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
+            theta *= theta_scale;
+        }
    }
 }

@@ -195,24 +270,18 @@ static void rope_corr_dims(int     n_dims,
 }

 static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
-    const HVX_Vector * restrict vsrc   = (const HVX_Vector *) src0;
-    const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
-    HVX_Vector       * restrict vdst   = (HVX_Vector *) dst;
+    const uint32_t he = ne / 2;
+    const uint32_t nvec = he / 32;
+    const uint32_t nloe = he % 32;

-    uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2
+    for (uint32_t i = 0; i < nvec; i++) {
+        HVX_Vector v0 = ((const HVX_Vector *) src0)[i];
+        HVX_Vector v1 = hvx_vmemu(src0 + he + i * 32);

-    uint32_t he = ne / 2;         // half_dims offset in elements
-    uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors
+        HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
+        HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];

-    #pragma unroll(2)
-    for (uint32_t i = 0; i < nvec; i += 2) {
-        HVX_Vector v0 = vsrc[i/2+0];
-        HVX_Vector v1 = vsrc[i/2+hv];
-
-        HVX_Vector v2 = vtheta[i+0];
-        HVX_Vector v3 = vtheta[i+1];
-
-        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);  // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
+        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);

        HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
        HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
@@ -222,37 +291,45 @@ static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * rest
        HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

-        vdst[i/2+0]  = Q6_Vsf_equals_Vqf32(v4);
-        vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5);
+        ((HVX_Vector *) dst)[i] = Q6_Vsf_equals_Vqf32(v4);
+        hvx_vmemu(dst + he + i * 32) = Q6_Vsf_equals_Vqf32(v5);
    }

-    for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
-        const float cos_theta = theta_cache[i+0];
-        const float sin_theta = theta_cache[i+1];
-        float x0 = src0[i/2];
-        float x1 = src0[i/2 + he];
-        dst[i/2]      = x0 * cos_theta - x1 * sin_theta;
-        dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta;
+    if (nloe > 0) {
+        HVX_Vector v0 = hvx_vmemu(src0 + nvec * 32);
+        HVX_Vector v1 = hvx_vmemu(src0 + he + nvec * 32);
+
+        HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 0];
+        HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 1];
+
+        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
+
+        HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
+        HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
+        HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
+        HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));
+
+        HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
+        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
+
+        hvx_vec_store_u(dst + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v4));
+        hvx_vec_store_u(dst + he + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v5));
    }
 }

 static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
-    const HVX_Vector * restrict vsrc   = (const HVX_Vector *) src0;
-    const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
-    HVX_Vector       * restrict vdst   = (HVX_Vector *) dst;
+    const uint32_t nvec = ne / 64;
+    const uint32_t nloe = ne % 64;

-    uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two
+    for (uint32_t i = 0; i < nvec; i++) {
+        HVX_Vector v0 = ((const HVX_Vector *) src0)[i * 2 + 0];
+        HVX_Vector v1 = ((const HVX_Vector *) src0)[i * 2 + 1];

-    #pragma unroll(2)
-    for (uint32_t i = 0; i < nvec; i+=2) {
-        HVX_Vector v0 = vsrc[i+0];
-        HVX_Vector v1 = vsrc[i+1];
+        HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
+        HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];

-        HVX_Vector v2 = vtheta[i+0];
-        HVX_Vector v3 = vtheta[i+1];
-
-        HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4);  // vx0_x1[0] = x0, vx0_x1[1] = x1
-        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);  // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
+        HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4);
+        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);

        HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
        HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
@@ -264,17 +341,52 @@ static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict

        HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);

-        vdst[i+0] = Q6_V_lo_W(vstore);
-        vdst[i+1] = Q6_V_hi_W(vstore);
+        ((HVX_Vector *) dst)[i * 2 + 0] = Q6_V_lo_W(vstore);
+        ((HVX_Vector *) dst)[i * 2 + 1] = Q6_V_hi_W(vstore);
    }

-    for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
-        const float cos_theta = theta_cache[i+0];
-        const float sin_theta = theta_cache[i+1];
-        float x0 = src0[i+0];
-        float x1 = src0[i+1];
-        dst[i+0] = x0 * cos_theta - x1 * sin_theta;
-        dst[i+1] = x0 * sin_theta + x1 * cos_theta;
+    if (nloe > 0) {
+        if (nloe <= 32) {
+            HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
+            HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
+
+            HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(Q6_V_vzero(), v0, -4);
+            HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(Q6_V_vzero(), v2, -4);
+
+            HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+            HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+
+            HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
+            HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
+
+            HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
+
+            hvx_vec_store_u(dst + nvec * 64, nloe * sizeof(float), Q6_V_lo_W(vstore));
+        } else {
+            HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
+            HVX_Vector v1 = hvx_vmemu(src0 + nvec * 64 + 32);
+
+            HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
+            HVX_Vector v3 = hvx_vmemu(theta_cache + nvec * 64 + 32);
+
+            HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4);
+            HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
+
+            HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+            HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+            HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+
+            HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
+            HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
+
+            HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
+
+            ((HVX_Vector *) dst)[nvec * 2 + 0] = Q6_V_lo_W(vstore);
+            hvx_vec_store_u(dst + nvec * 64 + 32, (nloe - 32) * sizeof(float), Q6_V_hi_W(vstore));
+        }
    }
 }

@@ -348,13 +460,19 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
    const int32_t * pos = (const int32_t *) src1->data;
    const float * freq_factors = src2 ? (const float *) src2->data : NULL;

-    uint32_t ir = 0;
+    const uint32_t i3_start = fastdiv(src0_start_row, &rctx->div_ne2_ne1);
+    const uint32_t rem      = fastmodulo(src0_start_row, ne2 * ne1, &rctx->div_ne2_ne1);
+    const uint32_t i2_start = fastdiv(rem, &rctx->div_ne1);
+    const uint32_t i1_start = fastmodulo(rem, ne1, &rctx->div_ne1);
+
+    uint32_t ir = src0_start_row;
    uint32_t prev_i2 = (uint32_t) -1;

-    for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
-        for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
-            for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads
-                if (ir < src0_start_row) { ir++; i1++; continue; }
+    for (uint32_t i3 = i3_start; i3 < ne3; i3++) { // batch
+        const uint32_t i2_init = (i3 == i3_start) ? i2_start : 0;
+        for (uint32_t i2 = i2_init; i2 < ne2; i2++) { // seq-len
+            const uint32_t i1_init = (i3 == i3_start && i2 == i2_start) ? i1_start : 0;
+            for (uint32_t i1 = i1_init; i1 < ne1; ) { // attn-heads
                if (ir >= src0_end_row) goto done;

                // Rows in this block
@@ -407,9 +525,6 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
                                        ne0, rctx->ext_factor, rctx->attn_factor,
                                        theta_cache, rctx->theta_scale);
                    }
-
-                    // FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache,
-                    //         (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
                }

                // Skip output DMA transactions from prev block (if any)
@@ -489,7 +604,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
    // Aligned row sizes for VTCM
    const size_t src0_row_size_aligned    = hex_round_up(src0_row_size, VLEN);
    const size_t dst_row_size_aligned     = hex_round_up(dst_row_size, VLEN);
-    const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128);
+    const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 256);

    // Calculate spad sizes per thread
    size_t src0_spad_per_thread = theta_cache_size_aligned + HTP_ROPE_SPAD_NROWS * src0_row_size_aligned;
@@ -546,6 +661,11 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
    rctx.src0_nrows = src0_nrows;
    rctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;

+    if (src0_nrows > 0) {
+        rctx.div_ne2_ne1 = init_fastdiv_values(dst->ne[2] * dst->ne[1]);
+        rctx.div_ne1     = init_fastdiv_values(dst->ne[1]);
+    }
+
    FARF(HIGH, "rope-f32 n-rows %u n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx.src0_nrows, rctx.n_dims, ne0,
         rctx.ext_factor, rctx.theta_scale, rctx.attn_factor);

--- a/ggml/src/ggml-hexagon/htp/set-rows-ops.c
+++ b/ggml/src/ggml-hexagon/htp/set-rows-ops.c
@@ -65,6 +65,9 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
    // parallelize by rows of src0
    const uint32_t dr  = srctx->src0_nrows_per_thread;
    const uint32_t ir0 = dr * ith;
+    if (ir0 >= nr) {
+        return;
+    }
    const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;

    const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
@@ -109,6 +112,9 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da
    // parallelize by rows of src0
    const uint32_t dr  = srctx->src0_nrows_per_thread;
    const uint32_t ir0 = dr * ith;
+    if (ir0 >= nr) {
+        return;
+    }
    const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;

    const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -207,7 +207,7 @@ static void hvx_fast_norm_f32(const uint8_t * restrict src,

    // scale = rsqrt(variance + epsilon),  mean_x broadcast for subtraction
    HVX_Vector scale_v  = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(var_epsilon_v));
-    HVX_Vector mean_x_b = hvx_vec_splat_f32(hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(mean_x_v)));
+    HVX_Vector mean_x_b = hvx_vec_repl_f32(Q6_Vsf_equals_Vqf32(mean_x_v));

    #pragma unroll(4)
    for (int i = 0; i < nvec; i++) {
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -215,6 +215,30 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
 // device
 //

+enum ggml_metal_device_id {
+    GGML_METAL_DEVICE_GENERIC = 0,
+
+    GGML_METAL_DEVICE_M1,
+    GGML_METAL_DEVICE_M1_PRO,
+    GGML_METAL_DEVICE_M1_MAX,
+    GGML_METAL_DEVICE_M1_ULTRA,
+    GGML_METAL_DEVICE_M2,
+    GGML_METAL_DEVICE_M2_PRO,
+    GGML_METAL_DEVICE_M2_MAX,
+    GGML_METAL_DEVICE_M2_ULTRA,
+    GGML_METAL_DEVICE_M3,
+    GGML_METAL_DEVICE_M3_PRO,
+    GGML_METAL_DEVICE_M3_MAX,
+    GGML_METAL_DEVICE_M3_ULTRA,
+    GGML_METAL_DEVICE_M4,
+    GGML_METAL_DEVICE_M4_PRO,
+    GGML_METAL_DEVICE_M4_MAX,
+    GGML_METAL_DEVICE_M5,
+    GGML_METAL_DEVICE_M5_PRO,
+    GGML_METAL_DEVICE_M5_MAX,
+    GGML_METAL_DEVICE_M5_ULTRA,
+};
+
 struct ggml_metal_device_props {
    int device;
    char name[128];
@@ -234,6 +258,8 @@ struct ggml_metal_device_props {

    bool supports_gpu_family_apple7;

+    enum ggml_metal_device_id device_id;
+
    int op_offload_min_batch_size;
 };

--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -628,6 +628,50 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
    free(rsets);
 }

+static enum ggml_metal_device_id ggml_metal_device_id_parse(const char * name) {
+    if (!name) {
+        return GGML_METAL_DEVICE_GENERIC;
+    }
+
+    static const char prefix[] = "Apple ";
+    if (strncmp(name, prefix, sizeof(prefix) - 1) != 0) {
+        return GGML_METAL_DEVICE_GENERIC;
+    }
+    const char * suffix = name + sizeof(prefix) - 1;
+
+    static const struct {
+        const char * name;
+        enum ggml_metal_device_id id;
+    } table[] = {
+        {"M1",       GGML_METAL_DEVICE_M1},
+        {"M1 Pro",   GGML_METAL_DEVICE_M1_PRO},
+        {"M1 Max",   GGML_METAL_DEVICE_M1_MAX},
+        {"M1 Ultra", GGML_METAL_DEVICE_M1_ULTRA},
+        {"M2",       GGML_METAL_DEVICE_M2},
+        {"M2 Pro",   GGML_METAL_DEVICE_M2_PRO},
+        {"M2 Max",   GGML_METAL_DEVICE_M2_MAX},
+        {"M2 Ultra", GGML_METAL_DEVICE_M2_ULTRA},
+        {"M3",       GGML_METAL_DEVICE_M3},
+        {"M3 Pro",   GGML_METAL_DEVICE_M3_PRO},
+        {"M3 Max",   GGML_METAL_DEVICE_M3_MAX},
+        {"M3 Ultra", GGML_METAL_DEVICE_M3_ULTRA},
+        {"M4",       GGML_METAL_DEVICE_M4},
+        {"M4 Pro",   GGML_METAL_DEVICE_M4_PRO},
+        {"M4 Max",   GGML_METAL_DEVICE_M4_MAX},
+        {"M5",       GGML_METAL_DEVICE_M5},
+        {"M5 Pro",   GGML_METAL_DEVICE_M5_PRO},
+        {"M5 Max",   GGML_METAL_DEVICE_M5_MAX},
+        {"M5 Ultra", GGML_METAL_DEVICE_M5_ULTRA},
+    };
+
+    for (size_t i = 0; i < sizeof(table)/sizeof(table[0]); ++i) {
+        if (strcmp(suffix, table[i].name) == 0) {
+            return table[i].id;
+        }
+    }
+    return GGML_METAL_DEVICE_GENERIC;
+}
+
 ggml_metal_device_t ggml_metal_device_init(int device) {
    ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));

@@ -795,6 +839,8 @@ ggml_metal_device_t ggml_metal_device_init(int device) {

            dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];

+            dev->props.device_id = ggml_metal_device_id_parse([[dev->mtl_device name] UTF8String]);
+
            dev->props.op_offload_min_batch_size  = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

            dev->props.max_buffer_size            = dev->mtl_device.maxBufferLength;
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -164,6 +164,7 @@ set(GGML_OPENCL_KERNELS
    sqr
    sqrt
    ssm_conv
+    gated_delta_net
    sub
    sum_rows
    cumsum
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -412,6 +412,7 @@ struct ggml_backend_opencl_context {
    size_t max_workgroup_size;
    bool fp16_support;
    bool has_vector_subgroup_broadcast;
+    bool has_qcom_subgroup_shuffle = false;     // cl_qcom_subgroup_shuffle
    bool disable_fusion;

    std::regex *opfilter = nullptr; // regex of ops to not claim
@@ -634,6 +635,10 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_conv_2d_f32;
    cl_kernel kernel_conv_2d_f16_f32;
    cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
+    // [size_idx][kda][tgpp] where size_idx: 0=S_V=16, 1=32, 2=64, 3=128; kda: 0 or 1.
+    // tgpp 0 = TG variant (COLS_PER_LANE_GROUP=1), tgpp 1 = prefill variant (COLS_PER_LANE_GROUP=4).
+    cl_kernel kernel_gated_delta_net_f32[4][2][2] = {};
+
    cl_kernel kernel_timestep_embedding;
    cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
    cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns;
@@ -837,16 +842,16 @@ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
 static std::vector<std::unique_ptr<ggml_backend_opencl_device_context>> g_ggml_backend_opencl_dev_ctxs;

 inline std::string read_file(const std::string &path) {
-  std::ifstream ifs(path);
-  if (!ifs) {
-    return "";
-  }
-  std::string text;
-  ifs.seekg(0, std::ios::end);
-  text.resize(ifs.tellg());
-  ifs.seekg(0, std::ios::beg);
-  ifs.read(&text[0], text.size());
-  return text;
+    std::ifstream ifs(path);
+    if (!ifs) {
+        return "";
+    }
+    std::string text;
+    ifs.seekg(0, std::ios::end);
+    text.resize(ifs.tellg());
+    ifs.seekg(0, std::ios::beg);
+    ifs.read(&text[0], text.size());
+    return text;
 }

 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
@@ -2463,12 +2468,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
            if (backend_ctx->program_upscale) {
-                 cl_int err_bilinear;
-                 backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
-                 if (err_bilinear != CL_SUCCESS) {
+                cl_int err_bilinear;
+                backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
+                if (err_bilinear != CL_SUCCESS) {
                    GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
                    backend_ctx->kernel_upscale_bilinear = nullptr;
-                 }
+                }
            } else {
                backend_ctx->kernel_upscale_bilinear = nullptr;
            }
@@ -2538,8 +2543,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        GGML_LOG_CONT(".");
    }

-     // conv2d
-     {
+    // conv2d
+    {
        #ifdef GGML_OPENCL_EMBED_KERNELS
                const std::string kernel_src {
                    #include "conv2d.cl.h"
@@ -2597,6 +2602,86 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        GGML_LOG_CONT(".");
    }

+    // gated_delta_net: one kernel per (S_V, KDA, tgpp) triple.
+    {
+    #ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gated_delta_net.cl.h"
+        };
+    #else
+        const std::string kernel_src = read_file("gated_delta_net.cl");
+    #endif
+
+        const int gdn_sizes[4] = { 16, 32, 64, 128 };
+        const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1;
+        if (sg_size < 0) {
+            GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n");
+            exit(1);
+        }
+
+        for (int si = 0; si < 4; si++) {
+            const int S_V = gdn_sizes[si];
+
+            // MUST match the dispatcher heuristic in ggml_cl_gated_delta_net exactly.
+            int lanes_per_column;
+            if (S_V >= 128) {
+                lanes_per_column = 8;
+            } else {
+                lanes_per_column = std::min(S_V, sg_size);
+            }
+
+            // Round LANES_PER_COLUMN down until it is:
+            //  * power-of-two
+            //  * divides both S_V and sg_size
+            while (lanes_per_column > 1 &&
+                    (((lanes_per_column & (lanes_per_column - 1)) != 0) ||
+                    (S_V % lanes_per_column) != 0 ||
+                    (sg_size % lanes_per_column) != 0)) {
+                lanes_per_column >>= 1;
+            }
+
+            GGML_ASSERT(lanes_per_column >= 1);
+            GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0));
+            GGML_ASSERT((S_V % lanes_per_column) == 0);
+            GGML_ASSERT((sg_size % lanes_per_column) == 0);
+
+            const bool is_partial_reduce = (lanes_per_column != 1) && (lanes_per_column < sg_size);
+            int use_qcom_shuffle = 0;
+            if (is_partial_reduce) {
+                if (backend_ctx->has_qcom_subgroup_shuffle) {
+                    use_qcom_shuffle = 1;
+                }
+            }
+            for (int kda = 0; kda < 2; kda++) {
+                for (int tgpp = 0; tgpp < 2; tgpp++) {
+                    const int cpl = (tgpp == 0) ? 1 : 4;
+                    const int spw  = (tgpp == 0) ? 1 : 1;
+
+                    std::string opts = compile_opts;
+                    opts += " -DS_V=" + std::to_string(S_V);
+                    opts += " -DKDA=" + std::to_string(kda);
+                    opts += " -DSUBGROUP_SIZE=" + std::to_string(sg_size);
+                    opts += " -DLANES_PER_COLUMN=" + std::to_string(lanes_per_column);
+                    opts += " -DCOLS_PER_LANE_GROUP=" + std::to_string(cpl);
+                    opts += " -DUSE_QCOM_SUBGROUP_SHUFFLE=" + std::to_string(use_qcom_shuffle);
+
+                    // Since spw=1 is found to be optimal, SUBGROUPS_PER_WG > 1 code in
+                    // the kernel is removed. If you want to experiment with spw > 1,
+                    // Please remember to implement code to handle it.
+                    opts += " -DSUBGROUPS_PER_WG=" + std::to_string(spw);
+
+                    cl_program prog = build_program_from_source(
+                        backend_ctx->context, backend_ctx->device, kernel_src.c_str(), opts);
+
+                    CL_CHECK((backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp] =
+                                clCreateKernel(prog, "kernel_gated_delta_net", &err), err));
+                    CL_CHECK(clReleaseProgram(prog));
+                }
+            }
+        }
+        GGML_LOG_CONT(".");
+    }
+
    // mul_mv_id_q4_0_f32_8x_flat
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2827,7 +2912,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
 #ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gemm_noshuffle_q4_1_f32.cl.h"
-       };
+        };
 #else
        const std::string kernel_src = read_file("gemm_noshuffle_q4_1_f32.cl");
 #endif
@@ -2866,7 +2951,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
 #ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gemm_noshuffle_iq4_nl_f32.cl.h"
-       };
+        };
 #else
        const std::string kernel_src = read_file("gemm_noshuffle_iq4_nl_f32.cl");
 #endif
@@ -2905,7 +2990,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
 #ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gemm_noshuffle_q8_0_f32.cl.h"
-       };
+        };
 #else
        const std::string kernel_src = read_file("gemm_noshuffle_q8_0_f32.cl");
 #endif
@@ -2946,7 +3031,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
 #ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gemm_noshuffle_q4_k_f32.cl.h"
-       };
+        };
 #else
        const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl");
 #endif
@@ -3781,6 +3866,16 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated

+    // check support for qcom_subgroup_shuffle
+    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") != NULL) {
+        GGML_LOG_INFO("ggml_opencl: cl_khr_subgroups support: true\n");
+        if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) {
+            backend_ctx->has_qcom_subgroup_shuffle = true;
+        }
+    }
+    GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n",
+        backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false");
+
    // Check if ext_buffer contains cl_khr_fp16
    backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
@@ -4832,17 +4927,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
-                   return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                case GGML_UNARY_OP_SIGMOID:
                    return ggml_is_contiguous(op->src[0]);
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_EXP:
-                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
+                    return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
                case GGML_UNARY_OP_EXPM1:
-                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
+                    return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
                case GGML_UNARY_OP_SOFTPLUS:
-                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
+                    return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
                default:
                    return false;
            }
@@ -4891,6 +4986,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                   (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
        case GGML_OP_SSM_CONV:
            return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
+        case GGML_OP_GATED_DELTA_NET:
+            {
+                // Match the Vulkan backend: only F32 -> F32, S_v in {16, 32, 64, 128}.
+                if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) {
+                    return false;
+                }
+                const int64_t S_v = op->src[2]->ne[0];
+                return S_v == 16 || S_v == 32 || S_v == 64 || S_v == 128;
+            }
        case GGML_OP_CONCAT:
            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
        case GGML_OP_TIMESTEP_EMBEDDING:
@@ -10555,7 +10659,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
    size_t local_work_size[]  = { lws0, 1, 1 };

    size_t * local_work_size_ptr = local_work_size;
-     if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
+    if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;
    }

@@ -17052,6 +17156,185 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }

+static void ggml_cl_gated_delta_net(ggml_backend_t backend, ggml_tensor * dst) {
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const ggml_tensor * src_q     = dst->src[0];
+    const ggml_tensor * src_k     = dst->src[1];
+    const ggml_tensor * src_v     = dst->src[2];
+    const ggml_tensor * src_g     = dst->src[3];
+    const ggml_tensor * src_beta  = dst->src[4];
+    const ggml_tensor * src_state = dst->src[5];
+
+    GGML_ASSERT(src_q && src_q->extra);
+    GGML_ASSERT(src_k && src_k->extra);
+    GGML_ASSERT(src_v && src_v->extra);
+    GGML_ASSERT(src_g && src_g->extra);
+    GGML_ASSERT(src_beta && src_beta->extra);
+    GGML_ASSERT(src_state && src_state->extra);
+
+    ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context;
+
+    const cl_uint S_v      = (cl_uint) src_v->ne[0];
+    const cl_uint H_v      = (cl_uint) src_v->ne[1];
+    const cl_uint n_tokens = (cl_uint) src_v->ne[2];
+    const cl_uint n_seqs   = (cl_uint) src_v->ne[3];
+    const cl_uint K        = (cl_uint) src_state->ne[1];
+
+    int si;
+    switch (S_v) {
+        case 16:  si = 0; break;
+        case 32:  si = 1; break;
+        case 64:  si = 2; break;
+        case 128: si = 3; break;
+        default:
+            GGML_ASSERT(false && "ggml_cl_gated_delta_net: unsupported S_v");
+    }
+
+    const int kda = (src_g->ne[0] == (int64_t) S_v) ? 1 : 0;
+
+    // TODO: Optimize when S_v!=128. Not necessary for now as Qwen3.5/6 are all S_v=128
+    // token generation mode (tgpp=0):
+    // process 1 token at a time, so columns per lane (cpl) == 1
+    // prompt processing mode (tgpp=1):
+    // cpl=4 to process 4 tokens for single-token. 4 is chosen for Adreno 750 as per
+    // work-item/thread has at most 128 registers.
+    // All Qwen3.5/6 models are S_v == 128, so LANES_PER_COLUMN == 8
+    // such that ROWS_PER_LANE = 128/8 = 16
+    // Variables in the kernel:
+    // k_reg, q_reg, g_exp are all 16 floats
+    // s_shard has cpl*ROWS_PER_LANE = 4*16 = 64 floats
+    // Total 112 registers used.
+    // subgroups_per_workgroup (spw) can be set to 1,2,4,8,16 for tg and 1,2,4 for pp
+    // for S_v=128.
+    // Empirically found that when spw=1, we get the best performance for both tg and pp
+    const int tgpp = (n_tokens == 1) ? 0 : 1;
+    const int cpl  = (tgpp == 0) ? 1 : 4;
+    // spw needs adjustment when S_v != 128
+    const int spw  = (tgpp == 0) ? 1 : 1;
+
+    cl_kernel kernel = backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp];
+    GGML_ASSERT(kernel != nullptr);
+
+    const cl_uint s_off = S_v * H_v * n_tokens * n_seqs;
+
+    const cl_uint sq1 = (cl_uint)(src_q->nb[1]    / sizeof(float));
+    const cl_uint sq2 = (cl_uint)(src_q->nb[2]    / sizeof(float));
+    const cl_uint sq3 = (cl_uint)(src_q->nb[3]    / sizeof(float));
+    const cl_uint sv1 = (cl_uint)(src_v->nb[1]    / sizeof(float));
+    const cl_uint sv2 = (cl_uint)(src_v->nb[2]    / sizeof(float));
+    const cl_uint sv3 = (cl_uint)(src_v->nb[3]    / sizeof(float));
+    const cl_uint sb1 = (cl_uint)(src_beta->nb[1] / sizeof(float));
+    const cl_uint sb2 = (cl_uint)(src_beta->nb[2] / sizeof(float));
+    const cl_uint sb3 = (cl_uint)(src_beta->nb[3] / sizeof(float));
+
+    const cl_uint H_k = (cl_uint) src_q->ne[1];
+    const cl_uint rq3 = (cl_uint)(src_v->ne[3] / src_q->ne[3]);
+
+    const float scale = 1.0f / sqrtf((float) S_v);
+
+    ggml_tensor_extra_cl * extra_q     = (ggml_tensor_extra_cl *) src_q->extra;
+    ggml_tensor_extra_cl * extra_k     = (ggml_tensor_extra_cl *) src_k->extra;
+    ggml_tensor_extra_cl * extra_v     = (ggml_tensor_extra_cl *) src_v->extra;
+    ggml_tensor_extra_cl * extra_g     = (ggml_tensor_extra_cl *) src_g->extra;
+    ggml_tensor_extra_cl * extra_beta  = (ggml_tensor_extra_cl *) src_beta->extra;
+    ggml_tensor_extra_cl * extra_state = (ggml_tensor_extra_cl *) src_state->extra;
+    ggml_tensor_extra_cl * extra_dst   = (ggml_tensor_extra_cl *) dst->extra;
+
+    const cl_ulong off_q     = extra_q->offset     + src_q->view_offs;
+    const cl_ulong off_k     = extra_k->offset     + src_k->view_offs;
+    const cl_ulong off_v     = extra_v->offset     + src_v->view_offs;
+    const cl_ulong off_g     = extra_g->offset     + src_g->view_offs;
+    const cl_ulong off_beta  = extra_beta->offset  + src_beta->view_offs;
+    const cl_ulong off_state = extra_state->offset + src_state->view_offs;
+    const cl_ulong off_dst   = extra_dst->offset   + dst->view_offs;
+
+    int idx = 0;
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem),   &extra_q->data_device));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_q));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem),   &extra_k->data_device));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_k));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem),   &extra_v->data_device));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_v));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem),   &extra_g->data_device));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_g));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem),   &extra_beta->data_device));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_beta));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem),   &extra_state->data_device));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_state));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem),   &extra_dst->data_device));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_dst));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &H_v));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &n_tokens));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &n_seqs));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &s_off));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sq1));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sq2));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sq3));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sv1));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sv2));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sv3));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sb1));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sb2));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &sb3));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &H_k));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),  &rq3));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(float),    &scale));
+    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint),    &K));
+
+    // Subgroup size is 64 for Adreno and 32 for Intel
+    const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1;
+    if (sg_size < 0) {
+        GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n");
+        exit(1);
+    }
+
+    // For the subgroup-shuffle kernel, we can safely prefer 8 lanes/column for S_v>=128
+    // For the subgroup-shuffle kernel:
+    //   S_v >= 128  -> prefer 8 lanes/column (good occupancy & register pressure tradeoff)
+    //   else        -> min(S_v, subgroup_size)
+    int lanes_per_column;
+    if ((int)S_v >= 128) {
+        lanes_per_column = 8;
+    } else {
+        lanes_per_column = std::min((int)S_v, sg_size);
+    }
+
+    // Max workgroup size for Adreno 750 is 1024
+    const int wg_size = sg_size * spw;
+
+    // Ensure lanes_per_column is a power-of-two and divides both S_v and subgroup_size.
+    // (Required for lane-group shuffle-xor reduction correctness.)
+    while (lanes_per_column > 1 &&
+            (((lanes_per_column & (lanes_per_column - 1)) != 0) ||
+            (((int)S_v % lanes_per_column) != 0) ||
+            (sg_size % lanes_per_column) != 0)) {
+        lanes_per_column >>= 1;
+    }
+    GGML_ASSERT(lanes_per_column >= 1);
+    GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0));
+    GGML_ASSERT(((int)S_v % lanes_per_column) == 0);
+    GGML_ASSERT((sg_size % lanes_per_column) == 0);
+
+    const int cols_per_wg = spw * (sg_size / lanes_per_column) * cpl;
+    GGML_ASSERT(cols_per_wg > 0);
+    GGML_ASSERT(((int)S_v % cols_per_wg) == 0);
+
+    size_t global_work_size[3];
+    size_t local_work_size[3];
+
+    global_work_size[0] = (size_t) H_v * (size_t) wg_size;
+    global_work_size[1] = (size_t) n_seqs;
+    global_work_size[2] = (size_t) S_v / (size_t) cols_per_wg;
+
+    local_work_size[0]  = (size_t) wg_size;
+    local_work_size[1]  = 1;
+    local_work_size[2]  = 1;
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------
@@ -17267,8 +17550,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            }
            func = ggml_cl_group_norm;
            break;
-                case GGML_OP_REPEAT:
-             if (!any_on_device) {
+        case GGML_OP_REPEAT:
+            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_repeat;
@@ -17297,6 +17580,14 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            }
            func = ggml_cl_ssm_conv;
            break;
+        case GGML_OP_GATED_DELTA_NET:
+            if (!any_on_device) {
+                return false;
+            }
+            // GDN has 6 source tensors, so it cannot use the standard
+            // (src0, src1, dst) func signature. Dispatch directly and return.
+            ggml_cl_gated_delta_net(backend, tensor);
+            return true;
        case GGML_OP_CONCAT:
            if (!any_on_device) {
                return false;
--- a/ggml/src/ggml-opencl/kernels/gated_delta_net.cl
+++ b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl
@@ -0,0 +1,247 @@
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#ifndef S_V
+#define S_V 128
+#endif
+#ifndef KDA
+#define KDA 0
+#endif
+#ifndef SUBGROUP_SIZE
+#define SUBGROUP_SIZE 64
+#endif
+#ifndef LANES_PER_COLUMN
+#define LANES_PER_COLUMN 8
+#endif
+#ifndef COLS_PER_LANE_GROUP
+#define COLS_PER_LANE_GROUP 1
+#endif
+#ifndef SUBGROUPS_PER_WG
+#define SUBGROUPS_PER_WG 1
+#endif
+#ifndef USE_QCOM_SUBGROUP_SHUFFLE
+#define USE_QCOM_SUBGROUP_SHUFFLE 0
+#endif
+
+#define WG_SIZE             (SUBGROUP_SIZE * SUBGROUPS_PER_WG)
+#define LANE_GROUPS_PER_SG  (SUBGROUP_SIZE / LANES_PER_COLUMN)
+#define COLS_PER_SG         (LANE_GROUPS_PER_SG * COLS_PER_LANE_GROUP)
+#define COLS_PER_WG         (SUBGROUPS_PER_WG * COLS_PER_SG)
+#define ROWS_PER_LANE       (S_V / LANES_PER_COLUMN)
+
+#if USE_QCOM_SUBGROUP_SHUFFLE
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+#endif
+
+// XOR-based parallel sum
+// This does a reduction across groups of LANES_PER_COLUMN
+static inline float reduce_add_shmem(float partial, __local float * temp, uint lane) {
+#if USE_QCOM_SUBGROUP_SHUFFLE
+   #pragma unroll
+    for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) {
+        partial += qcom_sub_group_shuffle_xor(partial, s, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, partial);
+    }
+    return partial;
+#else
+    temp[lane] = partial;
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+    #pragma unroll
+    for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) {
+        float other = temp[lane ^ s];
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+        temp[lane] += other;
+        sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    const float result = temp[lane];
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+    return result;
+#endif
+}
+
+#define REDUCE_PARTIAL(partial, temp_ptr, lid) \
+    ((LANES_PER_COLUMN == 1u) ? (partial) : reduce_add_shmem((partial), (temp_ptr), (lid)))
+
+// force compiler to optimize kernel for a specific fixed work-group size
+__attribute__((reqd_work_group_size(WG_SIZE, 1, 1)))
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_32
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_gated_delta_net(
+        global const char * q_buf,     ulong off_q,
+        global const char * k_buf,     ulong off_k,
+        global const char * v_buf,     ulong off_v,
+        global const char * g_buf,     ulong off_g,
+        global const char * beta_buf,  ulong off_beta,
+        global const char * state_buf, ulong off_state,
+        global       char * dst_buf,   ulong off_dst,
+        uint  H_v,
+        uint  n_tokens,
+        uint  n_seqs,
+        uint  s_off,
+        uint  sq1, uint sq2, uint sq3,
+        uint  sv1, uint sv2, uint sv3,
+        uint  sb1, uint sb2, uint sb3,
+        uint  H_k,
+        uint  rq3,
+        float scale,
+        uint K) {
+
+    global const float * data_q     = (global const float *)(q_buf     + off_q);
+    global const float * data_k     = (global const float *)(k_buf     + off_k);
+    global const float * data_v     = (global const float *)(v_buf     + off_v);
+    global const float * data_g     = (global const float *)(g_buf     + off_g);
+    global const float * data_beta  = (global const float *)(beta_buf  + off_beta);
+    global const float * data_state = (global const float *)(state_buf + off_state);
+    global       float * data_dst   = (global       float *)(dst_buf   + off_dst);
+
+    const uint head_id     = get_group_id(0);
+    const uint seq_id      = get_group_id(1);
+    const uint tid         = (uint)get_local_id(0);
+
+    const uint sg_id       = get_sub_group_id(); // subgroup id
+    const uint sg_lid      = get_sub_group_local_id(); // subgroup lane id
+
+    const uint lane        = sg_lid % LANES_PER_COLUMN;
+    const uint lane_group  = sg_lid / LANES_PER_COLUMN;
+    const uint wg_col_base = get_group_id(2) * COLS_PER_WG;
+    const uint sg_col_base = wg_col_base + sg_id * COLS_PER_SG;
+
+    const uint iq1 = head_id % H_k; // head index for Q and K
+    const uint iq3 = seq_id / rq3; // seq index for Q and K
+
+    const uint state_size = S_V * S_V;
+    const uint state_base = (seq_id * K * H_v + head_id) * state_size;
+    const uint q_off_base  = iq3 * sq3 + iq1 * sq1;
+    const uint v_off_base  = seq_id * sv3 + head_id * sv1;
+    const uint gb_off_base = seq_id * sb3 + head_id * sb1;
+    const uint state_out_base      = (seq_id * H_v + head_id) * state_size;
+    const uint state_size_per_snap = state_size * H_v * n_seqs;
+
+    __local float reduce_temp[WG_SIZE];
+    __local float * temp_ptr = reduce_temp + sg_id * SUBGROUP_SIZE;
+
+    float s_shard[COLS_PER_LANE_GROUP][ROWS_PER_LANE];
+    #pragma unroll
+    for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
+        const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
+        #pragma unroll
+        for (uint r = 0; r < ROWS_PER_LANE; r++) {
+            s_shard[cg][r] = data_state[state_base + col * S_V + r * LANES_PER_COLUMN + lane];
+        }
+    }
+
+    const int shift = (int)n_tokens - (int)K;
+    uint attn_off = (seq_id * n_tokens * H_v + head_id) * S_V;
+
+    for (uint t = 0; t < n_tokens; t++) {
+        const uint  q_off    = q_off_base + t * sq2;
+        const uint  k_off    = q_off;
+        const uint  v_off    = v_off_base + t * sv2;
+        const uint  gb_off   = gb_off_base + t * sb2;
+        const float beta_val = data_beta[gb_off];
+
+        float k_reg[ROWS_PER_LANE];
+        float q_reg[ROWS_PER_LANE];
+#if KDA
+        float g_exp[ROWS_PER_LANE];
+        #pragma unroll
+        for (uint r = 0; r < ROWS_PER_LANE; r++) {
+            const uint i = r * LANES_PER_COLUMN + lane;
+            k_reg[r] = data_k[k_off + i];
+            q_reg[r] = data_q[q_off + i];
+            g_exp[r] = exp(data_g[gb_off * S_V + i]);
+        }
+#else
+        const float g_val = exp(data_g[gb_off]);
+
+        #pragma unroll
+        for (uint r = 0; r < ROWS_PER_LANE; r++) {
+            const uint i = r * LANES_PER_COLUMN + lane;
+            k_reg[r] = data_k[k_off + i];
+            q_reg[r] = data_q[q_off + i];
+        }
+#endif
+
+        #pragma unroll
+        for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
+            const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
+            float v_val = data_v[v_off + col];
+
+            float kv_shard = 0.0f;
+            #pragma unroll
+            for (uint r = 0; r < ROWS_PER_LANE; r++) {
+#if KDA
+                float gs = g_exp[r] * s_shard[cg][r];
+                kv_shard += gs * k_reg[r];
+#else
+                kv_shard += s_shard[cg][r] * k_reg[r];
+#endif
+            }
+
+#if !KDA
+            kv_shard *= g_val; // Applied once instead of ROWS_PER_LANE times
+#endif
+
+            const float kv_col = REDUCE_PARTIAL(kv_shard, temp_ptr, sg_lid);
+
+            const float delta_col = (v_val - kv_col) * beta_val;
+
+            float attn_partial = 0.0f;
+            #pragma unroll
+            for (uint r = 0; r < ROWS_PER_LANE; r++) {
+#if KDA
+                float gs = g_exp[r] * s_shard[cg][r];
+#else
+                float gs = g_val * s_shard[cg][r];
+#endif
+                s_shard[cg][r] = gs + k_reg[r] * delta_col;
+                attn_partial += s_shard[cg][r] * q_reg[r];
+            }
+            const float attn_col = REDUCE_PARTIAL(attn_partial, temp_ptr, sg_lid);
+
+            if (lane == 0) {
+                data_dst[attn_off + col] = attn_col * scale;
+            }
+        }
+        attn_off += S_V * H_v;
+
+        if (K > 1u) {
+            const int target_slot = (int)t - shift;
+            if (target_slot >= 0 && target_slot < (int)K) {
+                #pragma unroll
+                for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
+                    const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
+                    const uint slot_base = s_off + (uint)target_slot * state_size_per_snap + state_out_base;
+                    #pragma unroll
+                    for (uint r = 0; r < ROWS_PER_LANE; r++) {
+                        data_dst[slot_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r];
+                    }
+                }
+            }
+        }
+    }
+
+    if (K == 1u) {
+        #pragma unroll
+        for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
+            const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
+            #pragma unroll
+            for (uint r = 0; r < ROWS_PER_LANE; r++) {
+                data_dst[s_off + state_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r];
+            }
+        }
+    }
+}
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -224,6 +224,7 @@ struct sycl_device_info {
    int max_wg_per_cu; // max work groups per compute unit - refer to
                       // cudaOccupancyMaxActiveBlocksPerMultiprocessor
    bool    vmm;                // virtual memory support
+    size_t  vmm_granularity;    // granularity of virtual memory
    size_t  total_vram;
    sycl_hw_info hw_info;
    optimize_feature opt_feature;
@@ -244,6 +245,8 @@ struct ggml_sycl_device_info {

 const ggml_sycl_device_info & ggml_sycl_info();

+static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128;
+
 struct ggml_sycl_pool {
    virtual ~ggml_sycl_pool() = default;

--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -19,6 +19,7 @@
 #include <cstdlib>
 #include <float.h>
 #include <limits>
+#include <optional>
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
@@ -37,6 +38,11 @@
 #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
 #    include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
 #endif
+#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
+#    include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
+#    include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
+#    define GGML_SYCL_USE_VMM
+#endif
 #include <sycl/half_type.hpp>

 #include "ggml.h"
@@ -70,6 +76,7 @@ int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
+int g_ggml_sycl_enable_vmm = 1;
 int g_ggml_sycl_prioritize_dmmv = 0;
 int g_ggml_sycl_use_async_mem_op = 0;
 int g_ggml_sycl_use_async_mem_op_requested = 1;
@@ -96,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
 //     GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 // #endif
    for (int i = 0; i < info.device_count; ++i) {
-        info.devices[i].vmm = 0;
        dpct::device_info prop;
        auto & device = dpct::dev_mgr::instance().get_device(i);

        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
            prop, device)));

+#if !defined(GGML_SYCL_USE_VMM)
+        info.devices[i].vmm = 0;
+#else
+        info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
+        if (info.devices[i].vmm) {
+            // NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
+            // but the L0 API requires a larger page size for allocs above 2 MiB and
+            // rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
+            // Here we clamp it to 2 MiB for simplicity, but other devices may require
+            // calling zeVirtualMemQueryPageSize or yet unexposed public API.
+            const size_t physical_page = 2ull << 20; // 2 MiB
+            info.devices[i].vmm_granularity = std::max<size_t>(
+                sycl::ext::oneapi::experimental::get_mem_granularity(
+                    device, sycl::context(device)),
+                physical_page);
+        }
+#endif
+
        info.default_tensor_split[i] = total_vram;
        total_vram += prop.get_global_mem_size();

@@ -234,6 +258,7 @@ static void ggml_check_sycl() try {
        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
        g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
        g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
+        g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
        g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
 #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
        g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
@@ -275,6 +300,11 @@ static void ggml_check_sycl() try {
 #else
        GGML_LOG_INFO("  GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
 #endif
+#if defined(GGML_SYCL_USE_VMM)
+        GGML_LOG_INFO("  GGML_SYCL_USE_VMM: yes\n");
+#else
+        GGML_LOG_INFO("  GGML_SYCL_USE_VMM: no\n");
+#endif

        GGML_LOG_INFO("Running with Environment Variables:\n");
        GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
@@ -293,6 +323,11 @@ static void ggml_check_sycl() try {
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
 #else
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
+#endif
+#if defined(GGML_SYCL_USE_VMM)
+        GGML_LOG_INFO("  GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
+#else
+        GGML_LOG_INFO("  GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
 #endif
        GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
        g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
@@ -754,7 +789,7 @@ catch (sycl::exception const &exc) {
 }

 static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
+    return SYCL_BUFFER_ALIGNMENT;
    GGML_UNUSED(buft);
 }

@@ -1177,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
 }

 static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
+    return SYCL_BUFFER_ALIGNMENT;
    GGML_UNUSED(buft);
 }

@@ -1462,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
    }
 };

+// pool with virtual memory management
+#if defined(GGML_SYCL_USE_VMM)
+struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
+    static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
+
+    int           device;
+    sycl::context ctx;
+    sycl::device  dev;
+
+    uintptr_t pool_addr = 0;
+    size_t    pool_used = 0;
+    size_t    pool_size = 0;
+    size_t    granularity;
+
+    // physical_mem owns the commits (unlike cuMemMap)
+    struct mapping {
+        sycl::ext::oneapi::experimental::physical_mem phys;
+        void * map_ptr;
+    };
+    std::vector<mapping> mappings;
+
+    explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
+        device(device_),
+        ctx(qptr_->get_context()),
+        dev(qptr_->get_device()),
+        granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
+    }
+
+    ~ggml_sycl_pool_vmm() {
+        if (pool_addr == 0) {
+            return;
+        }
+
+        // Per spec, unmap must (a) match the exact (ptr, size) of an earlier
+        // physical_mem::map() call and (b) precede destruction of the
+        // physical_mem objects (their dtors won't unmap).
+        for (auto & m : mappings) {
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
+                m.map_ptr, m.phys.size(), ctx)));
+        }
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
+            pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+        // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
+        size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
+
+        size_t avail = pool_size - pool_used;
+
+        if (size > avail) {
+            // round up to the next multiple of the granularity
+            size_t reserve_size = GGML_PAD(size - avail, granularity);
+
+            GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
+
+            // allocate more physical memory
+            std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
+            SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
+
+            // reserve virtual address space (if not already reserved)
+            if (pool_addr == 0) {
+                SYCL_CHECK(CHECK_TRY_ERROR(
+                    pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
+                        SYCL_POOL_VMM_MAX_SIZE, ctx)));
+            }
+
+            // map at the end of the pool
+            void * map_ptr = nullptr;
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                map_ptr = phys->map(pool_addr + pool_size, reserve_size,
+                                    sycl::ext::oneapi::experimental::address_access_mode::read_write)));
+
+            // stash these so we could unmap this exact range in dtor
+            mappings.push_back({
+                std::move(*phys),
+                map_ptr,
+            });
+
+            // add to the pool
+            pool_size += reserve_size;
+
+#ifdef DEBUG_SYCL_MALLOC
+            GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                          device, (unsigned long long) (pool_size/1024/1024),
+                          (unsigned long long) (reserve_size/1024/1024));
+#endif
+        }
+
+        GGML_ASSERT(pool_addr != 0);
+
+        void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
+        *actual_size = size;
+        pool_used += size;
+
+#ifdef DEBUG_SYCL_MALLOC
+        GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
+#endif
+
+        return ptr;
+    }
+
+    void free(void * ptr, size_t size) override {
+#ifdef DEBUG_SYCL_MALLOC
+        GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
+#endif
+
+        pool_used -= size;
+
+        // all deallocations must be in reverse order of the allocations
+        GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
+    }
+};
+#endif // defined(GGML_SYCL_USE_VMM)
+
 struct ggml_sycl_pool_host : public ggml_sycl_pool {
    queue_ptr qptr;
    int       device;
@@ -1542,20 +1692,19 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
 }

 std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
-    // TBD: NO VMM support
-    // if (ggml_sycl_info().devices[device].vmm) {
-    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
-    // }
-   return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
+#if defined(GGML_SYCL_USE_VMM)
+    if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
+        return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
+    }
+#endif // defined(GGML_SYCL_USE_VMM)
+    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
 }

+
 std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
    return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
 }

-// TBD pool with virtual memory management
-// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
-
 /// kernels
 typedef void (*ggml_sycl_op_mul_mat_t)(
    ggml_backend_sycl_context & ctx,
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -79,6 +79,12 @@ if (Vulkan_FOUND)
        "GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT"
    )

+    test_shader_extension_support(
+        "GL_NV_cooperative_matrix_decode_vector"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp"
+        "GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT"
+    )
+
    test_shader_extension_support(
        "GL_EXT_integer_dot_product"
        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/integer_dot.comp"
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -21,6 +21,19 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();

 #include <vulkan/vulkan.hpp>

+// Fallback definitions for VK_NV_cooperative_matrix_decode_vector in case the
+// installed Vulkan headers predate the extension.
+#ifndef VK_NV_cooperative_matrix_decode_vector
+#define VK_NV_cooperative_matrix_decode_vector 1
+#define VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME "VK_NV_cooperative_matrix_decode_vector"
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV ((VkStructureType)1000689000)
+typedef struct VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV {
+    VkStructureType    sType;
+    void*              pNext;
+    VkBool32           cooperativeMatrixDecodeVector;
+} VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV;
+#endif
+
 // SPIR-V Headers: different SDK installations expose different include paths.
 // LunarG Vulkan SDK on Windows typically provides <spirv-headers/spirv.hpp>.
 // Linux packages, MSYS2 and MinGW often use the Khronos layout <spirv/unified1/spirv.hpp>.
@@ -398,6 +411,7 @@ enum vk_conv_shapes {
    CONV_SHAPE_128x128,
    CONV_SHAPE_64x32,
    CONV_SHAPE_32x256,
+    CONV_SHAPE_64x128,
    CONV_SHAPE_COUNT,
 };

@@ -412,6 +426,7 @@ vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = {
    { 128, 128, 16 }, // CONV_SHAPE_128x128
    {  64,  32, 32 }, // CONV_SHAPE_64x32
    {  32, 256, 16 }, // CONV_SHAPE_32x256
+    {  64, 128, 16 }, // CONV_SHAPE_64x128
 };

 enum dmmv_wg_sizes {
@@ -447,14 +462,16 @@ struct vk_fa_pipeline_state {
 };

 struct vk_conv2d_pipeline_state {
-    vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH)
-        : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH) {}
+    vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH, uint32_t aligned)
+        : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH), aligned(aligned) {}

    uint32_t s0, s1, p0, p1, d0, d1, KW, KH;
+    // when set, shader can skip K/CRS/NPQ bounds checks and address clamps
+    uint32_t aligned;

    bool operator<(const vk_conv2d_pipeline_state &b) const {
-        return std::tie(s0, s1, p0, p1, d0, d1, KW, KH) <
-               std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH);
+        return std::tie(s0, s1, p0, p1, d0, d1, KW, KH, aligned) <
+               std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH, b.aligned);
    }
 };

@@ -674,6 +691,7 @@ struct vk_device_struct {
    uint32_t coopmat_int_k;

    bool coopmat2;
+    bool coopmat2_decode_vector;

    bool pipeline_executable_properties_support {};

@@ -764,7 +782,8 @@ struct vk_device_struct {
    vk_pipeline pipeline_clamp_f32;
    vk_pipeline pipeline_pad_f32;
    vk_pipeline pipeline_roll_f32;
-    vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
+    vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32;
+    vk_pipeline pipeline_repeat_i16;
    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_bf16_f32, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32;
    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_bf16_f32, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
    vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
@@ -841,6 +860,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines];
    vk_pipeline pipeline_topk_f32[num_topk_pipelines];
    vk_pipeline pipeline_sum_rows_f32;
+    vk_pipeline pipeline_fwht_f32[4];
    vk_pipeline pipeline_cumsum_f32;
    vk_pipeline pipeline_cumsum_small_f32;
    vk_pipeline pipeline_cumsum_multipass1_f32;
@@ -1131,6 +1151,13 @@ struct vk_op_push_constants {
    float param4;
 };

+struct vk_op_fwht_push_constants {
+    uint32_t n_rows;
+    uint32_t src_offset;
+    uint32_t dst_offset;
+    float scale;
+};
+
 struct vk_op_count_experts_push_constants {
    uint32_t ne00;
    uint32_t ne01;
@@ -2036,6 +2063,15 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
    GGML_UNUSED(src3);
 }

+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_fwht_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
+    p.src_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    p.dst_offset = get_misalign_bytes(ctx, dst)  / ggml_type_size(dst->type);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+    GGML_UNUSED(src3);
+}
+
 struct ggml_backend_vk_buffer_context {
    vk_device_ref device;
    vk_buffer dev_buffer;
@@ -2076,9 +2112,9 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
    std::string type = device ? "device" : "host";
    auto it = allocations.find(buf->buffer);
-    total_device -= device ? it->second : 0;
-    total_host -= device ? 0 : it->second;
    if (it != allocations.end()) {
+        total_device -= device ? it->second : 0;
+        total_host -= device ? 0 : it->second;
        VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
        allocations.erase(it);
    } else {
@@ -2162,6 +2198,136 @@ static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
 static std::condition_variable compile_count_cond;

+static constexpr uint32_t kSpvOpCooperativeMatrixLoadTensorNV = 5367;
+static constexpr uint32_t kSpvCapabilityCooperativeMatrixDecodeVectorNV = 5447;
+static constexpr uint32_t kSpvTensorAddressingDecodeVectorFuncBit = 0x4;
+
+// Remove SPV_NV_cooperative_matrix_decode_vector usage from a SPIR-V module so it
+// can be loaded on drivers that only support SPV_NV_cooperative_matrix2. Drops the
+// OpExtension declaration, the CooperativeMatrixDecodeVectorNV OpCapability, and the
+// DecodeVectorFunc operand from any OpCooperativeMatrixLoadTensorNV instruction.
+// Returns true when the input used the extension (and `out` was populated with a
+// stripped copy); returns false otherwise without touching `out`.
+static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count, std::vector<uint32_t> & out) {
+    static const char kDecodeVectorExt[] = "SPV_NV_cooperative_matrix_decode_vector";
+
+    if (word_count < 5) {
+        return false;
+    }
+
+    bool uses_decode_vector = false;
+    for (size_t pos = 5; pos < word_count; ) {
+        uint32_t word = code[pos];
+        uint32_t wc   = word >> spv::WordCountShift;
+        uint32_t op   = word & spv::OpCodeMask;
+        GGML_ASSERT(wc > 0 && pos + wc <= word_count);
+        if (op == spv::OpExtension && wc >= 2) {
+            const char * s = reinterpret_cast<const char *>(&code[pos + 1]);
+            if (strcmp(s, kDecodeVectorExt) == 0) {
+                uses_decode_vector = true;
+                break;
+            }
+        }
+        pos += wc;
+    }
+
+    if (!uses_decode_vector) {
+        return false;
+    }
+
+    VK_LOG_DEBUG("ggml_vk_strip_decode_vector: stripping SPV_NV_cooperative_matrix_decode_vector");
+
+    // Bulk-copy unchanged runs and only break the run when an instruction needs to
+    // be dropped or patched. Use reserve + insert/push_back so the destination buffer
+    // is touched exactly once (no zero-initialization pass from resize()).
+    out.clear();
+    out.reserve(word_count);
+
+    size_t run_start = 0;
+    auto flush_run = [&](size_t up_to) {
+        if (up_to > run_start) {
+            out.insert(out.end(), code + run_start, code + up_to);
+        }
+    };
+
+    for (size_t pos = 5; pos < word_count; ) {
+        uint32_t word = code[pos];
+        uint32_t wc   = word >> spv::WordCountShift;
+        uint32_t op   = word & spv::OpCodeMask;
+        GGML_ASSERT(wc > 0 && pos + wc <= word_count);
+
+        if (op == spv::OpExtension && wc >= 2) {
+            const char * s = reinterpret_cast<const char *>(&code[pos + 1]);
+            if (strcmp(s, kDecodeVectorExt) == 0) {
+                flush_run(pos);
+                pos += wc;
+                run_start = pos;
+                continue;
+            }
+        }
+
+        if (op == spv::OpCapability && wc == 2 && code[pos + 1] == kSpvCapabilityCooperativeMatrixDecodeVectorNV) {
+            flush_run(pos);
+            pos += wc;
+            run_start = pos;
+            continue;
+        }
+
+        if (op == kSpvOpCooperativeMatrixLoadTensorNV) {
+            // [opcode/wc][ResultType][Result][Pointer][Object][TensorLayout][MemOperand mask][mem extras...][TA mask][ta extras...]
+            GGML_ASSERT(wc >= 8);
+
+            uint32_t mem_mask = code[pos + 6];
+            size_t   cur      = pos + 7;
+            // Each of these MemoryAccess bits (when set) carries one trailing operand.
+            cur += (mem_mask & 0x2)     ? 1 : 0; // Aligned
+            cur += (mem_mask & 0x8)     ? 1 : 0; // MakePointerAvailable
+            cur += (mem_mask & 0x10)    ? 1 : 0; // MakePointerVisible
+            cur += (mem_mask & 0x10000) ? 1 : 0; // AliasScopeINTELMask
+            cur += (mem_mask & 0x20000) ? 1 : 0; // NoAliasINTELMask
+            GGML_ASSERT(cur < pos + wc);
+
+            uint32_t ta_mask = code[cur];
+            if ((ta_mask & kSpvTensorAddressingDecodeVectorFuncBit) == 0) {
+                pos += wc;
+                continue; // leave instruction inside the current unchanged run
+            }
+
+            flush_run(pos);
+
+            // Append unchanged prefix of the instruction (header through the mem-extras).
+            size_t inst_start = out.size();
+            size_t pre_n      = cur - pos;
+            out.insert(out.end(), code + pos, code + pos + pre_n);
+
+            // Emit TA mask with the DecodeVectorFunc bit cleared.
+            out.push_back(ta_mask & ~kSpvTensorAddressingDecodeVectorFuncBit);
+
+            // TA extras: TensorView (0x1) and DecodeFunc (0x2) are kept verbatim;
+            // DecodeVectorFunc (0x4) is dropped along with its trailing id operand.
+            size_t keep_ta_extras = ((ta_mask & 0x1) ? 1 : 0) + ((ta_mask & 0x2) ? 1 : 0);
+            if (keep_ta_extras) {
+                out.insert(out.end(), code + cur + 1, code + cur + 1 + keep_ta_extras);
+            }
+
+            GGML_ASSERT(wc == pre_n + 1 + keep_ta_extras + 1);
+
+            // Patch the instruction header with the new (one-shorter) word count.
+            uint32_t new_wc = wc - 1;
+            out[inst_start] = (new_wc << spv::WordCountShift) | op;
+
+            pos += wc;
+            run_start = pos;
+            continue;
+        }
+
+        pos += wc;
+    }
+
+    flush_run(word_count);
+    return true;
+}
+
 static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
                                         uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
                                         bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
@@ -2233,6 +2399,18 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
    }

+#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    if (device->coopmat2 && !device->coopmat2_decode_vector) {
+        const uint32_t * src   = spirv.empty() ? reinterpret_cast<const uint32_t *>(spv_data) : spirv.data();
+        size_t           src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size();
+        std::vector<uint32_t> stripped;
+        if (ggml_vk_strip_decode_vector(src, src_n, stripped)) {
+            spirv = std::move(stripped);
+            shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
+        }
+    }
+#endif
+
    pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);

    vk::PushConstantRange pcr(
@@ -4704,9 +4882,11 @@ static void ggml_vk_load_shaders(vk_device& device) {

    ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_repeat_i32, "repeat_i32", repeat_i32_len, repeat_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

+    ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
 #define CREATE_UNARY(name)  \
    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);  \
    ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
@@ -4819,6 +4999,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

    ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    // Intel Arc B390 was observed segfaulting with this shader.
+    if (device->subgroup_basic && device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL) {
+        int idx = 0;
+        for (uint32_t n : {64, 128, 256, 512}) {
+            if (device->subgroup_size <= n) {
+                ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_f32", fwht_f32_len, fwht_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { device->subgroup_size, n }, 1, true, true, device->subgroup_size);
+            }
+            ++idx;
+        }
+    }

    const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32,       "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size);
@@ -4934,7 +5124,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

    // conv2d, conv_transpose_2d
    for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
-        uint32_t conv2d_WG_SIZE  = 256;
+        // smaller WG for the small-tile fallback gives more concurrent WGs per SM
+        uint32_t conv2d_WG_SIZE  = (s == CONV_SHAPE_64x32) ? 128 : 256;
        uint32_t use_collectives = 0;  // Enables subgroup ops for preventing the re-calculation of indices.
        uint32_t conv2d_TS_K     = (s == CONV_SHAPE_64x32) ? 4 : 8;
        uint32_t conv2d_SHMEM_PAD = 4;
@@ -4973,18 +5164,77 @@ static void ggml_vk_load_shaders(vk_device& device) {
                conv2d_BS.CRS);  // CRS block size should be capped at subgroup size for correctness when shuffle is used.
        }

-        uint32_t conv2d_shmem_req =
-            (conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
-        if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
+        // cm1 is used only when cm2 is unavailable; capped at 64x128 (due to shared memory size).
+        // Requires 16x16x16 f16-acc since that's the fragment shape hard-coded in the shader.
+        // Subgroup size must be 32 or 64 (to keep WG_SIZE sane) and we need
+        // subgroup_size_control to force the driver to actually use it.
+        bool conv2d_use_cm1 = false;
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+        conv2d_use_cm1 = !device->coopmat2 &&
+                         device->coopmat_support && device->coopmat_support_16x16x16_f16acc &&
+                         device->subgroup_size_control &&
+                         (device->subgroup_size == 32 || device->subgroup_size == 64) &&
+                         s != CONV_SHAPE_128x128;
+#endif
+
+        const uint32_t conv2d_cm1_shmem_pad = 8;
+
+        auto shmem_req = [&](uint32_t pad, bool csh_store, bool fp16_shmem) {
+            const uint32_t elem_size = fp16_shmem ? (uint32_t)sizeof(uint16_t) : (uint32_t)sizeof(float);
+            const uint32_t csh_elems = csh_store ? conv2d_BS.K * conv2d_BS.NPQ : 0u;
+            return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size;
+        };
+
+        // coopmat1 needs to store the output through shared memory, so check up front
+        // whether it'll fit and disable it before applying coopmat1 parameters.
+        if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) {
+            conv2d_use_cm1 = false;
+        }
+
+        uint32_t conv2d_WM = 16, conv2d_WN = 16;  // cm1 subgroup tile, ignored otherwise
+        if (conv2d_use_cm1) {
+            conv2d_SHMEM_PAD = conv2d_cm1_shmem_pad;
+            // 16x16x16 fragments; pick WM/WN to keep WG_SIZE at 256
+            // (i.e. 8 subgroups for sg=32, 4 subgroups for sg=64).
+            const bool sg64 = (device->subgroup_size == 64);
+            switch (s) {
+                case CONV_SHAPE_64x32:   conv2d_WM = sg64 ? 32 : 16; conv2d_WN = 16; break;
+                case CONV_SHAPE_64x128:  conv2d_WM = 32; conv2d_WN = sg64 ? 64 : 32; break;
+                case CONV_SHAPE_32x256:  conv2d_WM = sg64 ? 16 : 32; conv2d_WN = sg64 ? 128 : 32; break;
+                default: break;
+            }
+            const uint32_t warps_M = conv2d_BS.K / conv2d_WM;
+            const uint32_t warps_N = conv2d_BS.NPQ / conv2d_WN;
+            conv2d_WG_SIZE         = warps_M * warps_N * device->subgroup_size;
+        }
+
+        // stage cm2 accumulator through shmem for coalesced global stores;
+        // skipped on 128x128 where the extra Csh footprint hurts occupancy.
+        // cm1 always uses the staged path.
+        uint32_t conv2d_csh_store = (device->coopmat2 && s != CONV_SHAPE_128x128) ? 1u : 0u;
+        if (conv2d_use_cm1) {
+            conv2d_csh_store = 1;
+        }
+
+        // shmem is fp16 on cm2/cm1 (matches Csh), fp32 on scalar
+        const bool conv2d_use_fp16_shmem = device->coopmat2 || conv2d_use_cm1;
+
+        // shrink CRS if the non-cm1 config still doesn't fit
+        if (device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_SHMEM_PAD, conv2d_csh_store, conv2d_use_fp16_shmem)) {
+            GGML_ASSERT(!conv2d_use_cm1);
            conv2d_BS.CRS = 8;
            if (use_collectives) {
                conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS);
            }
+            conv2d_csh_store = 0;
        }

        std::array<uint32_t, 3> wg_denoms = { conv2d_BS.K, 1, 1 };
        std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };

+        // cm1 needs a fixed subgroup width to match the WG_SIZE we computed
+        const uint32_t conv2d_required_subgroup_size = conv2d_use_cm1 ? device->subgroup_size : 0;
+
 #define CREATE_CONV(name, type_suffix, spv_suffix) \
        for (auto &c : device->pipeline_##name##type_suffix[s]) { \
            const vk_conv2d_pipeline_state &state = c.first;  \
@@ -4997,10 +5247,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
            spec_constants_cpy.push_back(state.d1); \
            spec_constants_cpy.push_back(state.KW); \
            spec_constants_cpy.push_back(state.KH); \
+            spec_constants_cpy.push_back(state.aligned); \
+            spec_constants_cpy.push_back(conv2d_csh_store); \
+            spec_constants_cpy.push_back(conv2d_WM); \
+            spec_constants_cpy.push_back(conv2d_WN); \
            ggml_vk_create_pipeline( \
                device, c.second, #name #type_suffix, \
                name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \
-                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives);    \
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives || conv2d_required_subgroup_size, conv2d_required_subgroup_size);    \
        }
 #define CREATE_CONVS(spv_suffix) \
        CREATE_CONV(conv2d, _f32, spv_suffix) \
@@ -5011,6 +5265,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
        if (device->coopmat2) {
            CREATE_CONVS(_cm2)
        } else
+#endif
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+        if (conv2d_use_cm1) {
+            CREATE_CONVS(_cm1)
+        } else
 #endif
        if (conv2d_UNROLL) {
            CREATE_CONVS(_unroll)
@@ -5083,6 +5342,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
        bool amd_shader_core_properties2 = false;
        bool pipeline_robustness = false;
        bool coopmat2_support = false;
+        bool coopmat2_decode_vector_support = false;
        bool pipeline_executable_properties_support = false;
        device->coopmat_support = false;
        device->integer_dot_product = false;
@@ -5117,6 +5377,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                       !getenv("GGML_VK_DISABLE_COOPMAT2")) {
                coopmat2_support = true;
 #endif
+            } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 &&
+                       !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) {
+                coopmat2_decode_vector_support = true;
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
            } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
                       !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
@@ -5394,6 +5657,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
        }
 #endif

+        VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {};
+        coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV;
+        if (coopmat2_decode_vector_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+            last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+            device_extensions.push_back(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME);
+        }
+
 #if defined(VK_KHR_shader_bfloat16)
        VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {};
        bfloat16_features.pNext = nullptr;
@@ -5553,6 +5824,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
                    found_fp32_128 && found_fp32_256 &&
                    coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) {
                    device->coopmat2 = true;
+                    device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
                }
            }
 #endif
@@ -5768,8 +6040,12 @@ static vk_device ggml_vk_get_device(size_t idx) {

        ggml_vk_load_shaders(device);

-        // Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled
-        const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !allow_graphics_queue;
+        // Prefer a dedicated transfer queue on AMD dGPUs (non-GCN) when graphics queue use is disabled.
+        const bool prefers_transfer_queue =
+            device->vendor_id == VK_VENDOR_ID_AMD &&
+            device->architecture != AMD_GCN &&
+            !device->uma &&
+            !allow_graphics_queue;

        if (!device->single_queue) {
            const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
@@ -5835,6 +6111,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    bool fp16_compute = false;
    bool coopmat_support = false;
    bool coopmat2_support = false;
+    bool coopmat2_decode_vector_support = false;
    bool integer_dot_product = false;
    bool bfloat16_support = false;

@@ -5853,6 +6130,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
                   !getenv("GGML_VK_DISABLE_COOPMAT2")) {
            coopmat2_support = true;
 #endif
+        } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 &&
+                   !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) {
+            coopmat2_decode_vector_support = true;
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
                    !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
@@ -5937,6 +6217,13 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    }
 #endif

+    VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {};
+    coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV;
+    if (coopmat2_decode_vector_support) {
+        last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+        last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
+    }
+
    vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);

    fp16 = fp16 && vk12_features.shaderFloat16;
@@ -5961,7 +6248,14 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 #endif
                   && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture);

-    std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
+    coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
+#if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    coopmat2_decode_vector_support = false;
+#endif
+
+    std::string matrix_cores = coopmat2_support ? (coopmat2_decode_vector_support ? "NV_coopmat2v" : "NV_coopmat2")
+                             : coopmat_support  ? "KHR_coopmat"
+                             : "none";

    std::string device_name = props2.properties.deviceName.data();
    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
@@ -6966,7 +7260,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
                            const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
                            const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
                            for (uint64_t i0 = 0; i0 < ne0; i0++) {
-                                slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 });
+                                slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 });
                            }
                        }
                    }
@@ -8474,6 +8768,68 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
        }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
 }

+static int ggml_vk_fwht_pipeline_idx(int64_t n) {
+    switch (n) {
+        case 64:  return 0;
+        case 128: return 1;
+        case 256: return 2;
+        case 512: return 3;
+        default:  return -1;
+    }
+}
+
+static bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst) {
+    if (ctx->num_additional_fused_ops != 0) {
+        return false;
+    }
+
+    if (ggml_get_op_params_i32(dst, 1) != GGML_HINT_SRC0_IS_HADAMARD) {
+        return false;
+    }
+
+    const int idx = ggml_vk_fwht_pipeline_idx(src1->ne[0]);
+    if (idx < 0 || ctx->device->pipeline_fwht_f32[idx] == nullptr) {
+        return false;
+    }
+
+    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    if (!ggml_is_contiguous(src1)) {
+        return false;
+    }
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    return true;
+}
+
+static void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst) {
+    const int idx = ggml_vk_fwht_pipeline_idx(src->ne[0]);
+    vk_pipeline pipeline = ctx->device->pipeline_fwht_f32[idx];
+
+    const uint32_t rows_per_workgroup = 4;
+    const uint32_t n_rows = (uint32_t)ggml_nrows(src);
+    const uint32_t max_workgroups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0];
+
+    const uint32_t total_workgroups = CEIL_DIV(n_rows, rows_per_workgroup);
+    const uint32_t workgroups_x = std::min(total_workgroups, max_workgroups_x);
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
+
+    const vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src, true);
+    const vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true);
+
+    vk_op_fwht_push_constants pc = {
+        n_rows,
+        0,
+        0,
+        1.0f / std::sqrt((float)src->ne[0]),
+    };
+    init_pushconst_tensor_offsets(ctx, pc, src, nullptr, nullptr, nullptr, dst);
+
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, { workgroups_x, 1, 1 });
+}
+
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
    ggml_tensor * dst = cgraph->nodes[node_idx];
    ggml_tensor * src0 = dst->src[0];
@@ -8507,6 +8863,8 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c

            m_offset += cur_M_size;
        }
+    } else if (ggml_vk_can_use_fwht(ctx, src1, dst)) {
+        ggml_vk_fwht(ctx, subctx, src1, dst);
    } else if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 &&
        // detect 0213 permutation, and batch size of 1
        src0->nb[0] <= src0->nb[2] &&
@@ -9473,10 +9831,23 @@ static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, u
    // so small convolutions will still choose a smaller tile.
    const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;

-    if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
+    // 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile.
+    bool allow_128x128 = true;
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) {
+        allow_128x128 = false;
+    }
+#endif
+
+    if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
        return CONV_SHAPE_128x128;
    } else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) {
        return CONV_SHAPE_32x256;
+    } else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
+        return CONV_SHAPE_64x128;
+    } else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
+        // cm1 fallback for large K when 128x128 isn't available
+        return CONV_SHAPE_64x128;
    } else {
        return CONV_SHAPE_64x32;
    }
@@ -9648,7 +10019,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
        return nullptr;
    case GGML_OP_REPEAT:
        if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
-            return ctx->device->pipeline_repeat_f32;
+            return ctx->device->pipeline_repeat_i32;
+        }
+        if (ggml_type_size(src0->type) == 2 && ggml_type_size(dst->type) == 2) {
+            return ctx->device->pipeline_repeat_i16;
        }
        return nullptr;
    case GGML_OP_REPEAT_BACK:
@@ -10008,7 +10382,18 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0;
            uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1;
            uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1;
-            vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH);
+
+            // tile-aligned shapes let the shader skip bounds checks
+            const uint32_t Cin = (uint32_t)src1->ne[2];
+            const uint32_t CRS = Cin * KW * KH;
+            const uint32_t BS_K   = vk_conv_block_sizes[shape].K;
+            const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS;
+            const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ;
+            const uint32_t aligned = ((K   % BS_K   == 0) &&
+                                      (CRS % BS_CRS == 0) &&
+                                      (NPQ % BS_NPQ == 0)) ? 1u : 0u;
+
+            vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned);

            std::map<vk_conv2d_pipeline_state, vk_pipeline> *pipelines = nullptr;
            if (op == GGML_OP_CONV_2D) {
@@ -16152,7 +16537,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                return false;
            }
        case GGML_OP_REPEAT:
-            return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
+            return ggml_type_size(op->type) == ggml_type_size(op->src[0]->type) &&
+                  (ggml_type_size(op->type) == sizeof(float) || ggml_type_size(op->type) == 2);
        case GGML_OP_REPEAT_BACK:
            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ROPE:
--- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -11,6 +11,10 @@ if (GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    message(STATUS "Enabling coopmat2 glslc support")
 endif()
+if (GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    add_compile_definitions(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
+    message(STATUS "Enabling coopmat2 decode_vector glslc support")
+endif()
 if (GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
    message(STATUS "Enabling dot glslc support")
--- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@@ -7,6 +7,13 @@
 #extension GL_KHR_memory_scope_semantics : enable
 #endif

+#ifdef COOPMAT
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_KHR_memory_scope_semantics : enable
+#endif
+
 #ifdef USE_COLLECTIVES
 #    extension GL_KHR_shader_subgroup_shuffle : enable
 #endif
@@ -77,6 +84,39 @@ layout(constant_id = 12) const uint d1             = 1;
 // Kernel spatial sizes
 layout(constant_id = 13) const uint KW             = 1;
 layout(constant_id = 14) const uint KH             = 1;
+// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned)
+layout(constant_id = 15) const uint aligned        = 0;
+// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this.
+layout(constant_id = 16) const uint csh_store      = 0;
+
+#ifdef COOPMAT
+// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of
+// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN ==
+// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size.
+layout(constant_id = 17) const uint WM             = 32;
+layout(constant_id = 18) const uint WN             = 32;
+const uint TM = 16;
+const uint TN = 16;
+const uint TK = 16;
+const uint cms_per_row = WM / TM;
+const uint cms_per_col = WN / TN;
+const uint warps_M     = BS_K / WM;
+const uint warps_N     = BS_NPQ / WN;
+#endif
+
+// without padding, H_idx/W_idx are in bounds by construction (non-TRANSPOSE only)
+#ifdef TRANSPOSE
+const bool hw_in_bounds = false;
+#else
+const bool hw_in_bounds = (p0 == 0) && (p1 == 0);
+#endif
+
+// TRANSPOSE stride alignment is trivially satisfied for stride 1
+#ifdef TRANSPOSE
+const bool stride_in_bounds = (s0 == 1) && (s1 == 1);
+#else
+const bool stride_in_bounds = true;
+#endif

 uint32_t       tid     = gl_LocalInvocationID.x;
 const uint32_t WG_SIZE = gl_WorkGroupSize.x;
@@ -94,7 +134,7 @@ uint32_t n_elems_out = K * NPQ;
 // Number of blocktiles per input
 uint32_t NB_CRS = splitWork(CRS, BS_CRS);

-#ifdef COOPMAT2
+#if defined(COOPMAT2) || defined(COOPMAT)
 #define SHMEM_TYPE float16_t
 #else
 #define SHMEM_TYPE float
@@ -112,6 +152,17 @@ const uint32_t Bsh_len = BS_CRS * Bsh_stride;
 shared SHMEM_TYPE Ash[Ash_len];  // K x CRS
 shared SHMEM_TYPE Bsh[Bsh_len];  // CRS x NPQ

+#if defined(COOPMAT2) || defined(COOPMAT)
+// stage matC through shmem so global stores are row-major (NPQ-contiguous)
+const uint32_t Csh_stride = BS_NPQ;
+#ifdef COOPMAT
+const uint32_t Csh_len    = BS_K * Csh_stride;
+#else
+const uint32_t Csh_len    = csh_store != 0 ? BS_K * Csh_stride : 1;
+#endif
+shared SHMEM_TYPE Csh[Csh_len];  // K x NPQ
+#endif
+
 // Threadtile sizes
 const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;

@@ -161,7 +212,7 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
    uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
    uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
    uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
-    if (K_idx < K && NPQ_idx < NPQ) {
+    if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
        dst_data[dst_idx] = D_TYPE(elem);
    }
    return elem;
@@ -176,6 +227,13 @@ void main() {
 #ifdef COOPMAT2
    coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
    matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
+#elif defined(COOPMAT)
+    coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
+    [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
+        sums[i] = coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0);
+    }
+    const uint warp_r = gl_SubgroupID / warps_N;
+    const uint warp_c = gl_SubgroupID % warps_N;
 #else
    float regC[TS_K][TS_NPQ];
    for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
@@ -228,12 +286,15 @@ void main() {
            uint32_t B_lx    = Ac;
            uint32_t K_idx   = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
 #ifdef TRANSPOSE
-            uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1);
+            uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03;
 #else
-            uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1);
+            uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03;
 #endif
+            if (aligned == 0) {
+                knl_idx = min(knl_idx, K * CRS - 1);
+            }
            float    val     = knl_data[knl_idx];
-            if (K_idx >= K || CRS_idx_a >= CRS) {
+            if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) {
                val = 0.0;
            }
            Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
@@ -282,15 +343,27 @@ void main() {
            uint32_t H_idx = OH_idx * s1 + KH_idx_b * d1 - p1;
            uint32_t W_idx = OW_idx * s0 + KW_idx_b * d0 - p0;
 #endif
-            uint32_t src_idx =
-                min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
+            uint32_t src_idx = W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13;
+            // skip clamp when address can't go OOB
+            if (aligned == 0 || !hw_in_bounds || !stride_in_bounds) {
+                src_idx = min(max(src_idx, 0), p.Cin * p.N * p.W * p.H - 1);
+            }
            float val = src_data[src_idx];
-            if (CRS_idx_b >= CRS || NPQ_idx >= NPQ
-                || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case)
+            bool oob = false;
+            if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) {
+                oob = true;
+            }
+            // also catches lower-bound underflow (idx wraps to 0x80000000+)
+            if (!hw_in_bounds && (H_idx >= p.H || W_idx >= p.W)) {
+                oob = true;
+            }
 #ifdef TRANSPOSE
-                || (H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0)
+            if (!stride_in_bounds &&
+                ((H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0))) {
+                oob = true;
+            }
 #endif
-                ) {
+            if (oob) {
                val = 0.0;
            }
            Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
@@ -303,6 +376,23 @@ void main() {
        coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
        coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
        matC = coopMatMulAdd(matA, matB, matC);
+#elif defined(COOPMAT)
+        // each subgroup multiplies its grid of fragments per TK-sized CRS chunk
+        [[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) {
+            coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a[cms_per_row];
+            [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+                const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK;
+                coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
+            }
+            [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+                coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
+                const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN;
+                coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
+                [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+                    sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]);
+                }
+            }
+        }
 #else
        if (T_y * TS_K < K) {
            UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
@@ -325,8 +415,51 @@ void main() {
        barrier();
    }
    /* Save C* */
+#if defined(COOPMAT2) || defined(COOPMAT)
+    // stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores
+#ifdef COOPMAT
+    const bool use_staged_store = true;
+#else
+    const bool use_staged_store = (csh_store != 0);
+#endif
+    if (use_staged_store) {
+#ifdef COOPMAT
+        // cm1: each subgroup stores its fragment grid into its Csh slot
+        [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+            [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+                const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN;
+                coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
+            }
+        }
+#else
+        coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
+#endif
+        barrier();
+
+        // cooperative shmem->global: WG threads spread across BS_NPQ (the
+        // contiguous direction of dst), each iter covers store_rows_per_iter K-rows
+        const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ;
+        const uint32_t store_iters         = BS_K / store_rows_per_iter;
+        const uint32_t k_thread_offset     = tid / BS_NPQ;
+        const uint32_t npq_thread          = tid % BS_NPQ;
+        [[unroll]] for (uint32_t i = 0; i < store_iters; i++) {
+            uint32_t k_local = i * store_rows_per_iter + k_thread_offset;
+            uint32_t K_idx   = B_idx_K * BS_K + k_local;
+            uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread;
+            uint32_t N_idx   = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL);
+            uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL);
+            uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
+            uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
+            if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
+                dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]);
+            }
+        }
+    }
 #ifdef COOPMAT2
-    coopMatPerElementNV(matC, matC, perElemOpStore);
+    else {
+        coopMatPerElementNV(matC, matC, perElemOpStore);
+    }
+#endif
 #else
    if (T_y * TS_K < K) {
        for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
@@ -337,7 +470,7 @@ void main() {
                uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
                uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
                uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
-                if (K_idx < K && NPQ_idx < NPQ) {
+                if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
                    dst_data[dst_idx] = regC[T_ly][T_lx];
                }
            }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
@@ -5,21 +5,60 @@
 #include "types.glsl"

 #if defined(DATA_A_F32)
+FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
+    return data_a[a_offset + ib];
+}
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    return vec4(data_a[a_offset + ib    ], data_a[a_offset + ib + 1],
+                data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
+}
+vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
+    return vec4(data_a[a_offset + ib    ], data_a[a_offset + ib + 1],
+                data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
+}
+
 #endif

 #if defined(DATA_A_F16)
+FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
+    return data_a[a_offset + ib];
+}
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    return vec4(data_a[a_offset + ib    ], data_a[a_offset + ib + 1],
+                data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
+}
+vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
+    const vec2 a = data_a_packed32[(a_offset + ib)/2];
+    const vec2 b = data_a_packed32[(a_offset + ib)/2 + 1];
+    return vec4(a, b);
+}
 #endif

 #if defined(DATA_A_BF16)
+FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
+    return bf16_to_fp32(data_a[a_offset + ib]);
+}
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    return vec2(bf16_to_fp32(data_a[a_offset + ib]), bf16_to_fp32(data_a[a_offset + ib + 1]));
 }
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    return vec4(bf16_to_fp32(data_a[a_offset + ib    ]), bf16_to_fp32(data_a[a_offset + ib + 1]),
+                bf16_to_fp32(data_a[a_offset + ib + 2]), bf16_to_fp32(data_a[a_offset + ib + 3]));
+}
+vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
+    const uint a = data_a_packed32[(a_offset + ib)/2];
+    const uint b = data_a_packed32[(a_offset + ib)/2 + 1];
+    return vec4(uintBitsToFloat((a & 0x0000ffff) << 16),
+                uintBitsToFloat( a & 0xffff0000),
+                uintBitsToFloat((b & 0x0000ffff) << 16),
+                uintBitsToFloat( b & 0xffff0000));
+}
 #endif

 #if defined(DATA_A_Q4_0)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
@@ -1,4 +1,12 @@

+// Each format defines a scalar dequantFunc<T> plus a V=4 dequantFunc<T>_v
+// passed as the optional vector decoder to coopMatLoadTensorNV via
+// GL_NV_cooperative_matrix_decode_vector. When the driver doesn't support
+// the extension, ggml-vulkan.cpp strips it from the compiled SPIR-V.
+#ifdef GL_NV_cooperative_matrix_decode_vector
+#extension GL_NV_cooperative_matrix_decode_vector : enable
+#endif
+
 #include "types.glsl"

 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufF32 {
@@ -25,6 +33,19 @@ float16_t dequantFuncQ1_0(const in decodeBufQ1_0 bl, const in uint blockCoords[2
    return bit != 0u ? d : -d;
 }

+f16vec4 dequantFuncQ1_0_v(const in decodeBufQ1_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d  = bl.block.d;
+    const float16_t md = -d;
+    const uint idx = coordInBlock[1];
+    const uint qs_nib = uint(bl.block.qs[idx >> 3]) >> (idx & 0x4u);
+    return f16vec4(
+        (qs_nib & 1u) != 0u ? d : md,
+        (qs_nib & 2u) != 0u ? d : md,
+        (qs_nib & 4u) != 0u ? d : md,
+        (qs_nib & 8u) != 0u ? d : md);
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
   block_q4_0_packed16 block;
 };
@@ -42,10 +63,28 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ4_0_v(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_i = (idx & 0xE) >> 1;       // even, in {0,2,4,6}
+    const uint qsw = uint32_t(bl.block.qs[qs_i    ])
+                   | (uint32_t(bl.block.qs[qs_i + 1u]) << 16);
+    // shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte.
+    const uint q4   = (qsw >> shift) & 0x0F0F0F0Fu;
+    const u8vec4 q  = unpack8(q4);
+    return f16vec4((vec4(q) - vec4(8.0)) * vec4(float(d)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 {
   block_q4_1 block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1_packed32 {
+   block_q4_1_packed32 block;
+};
+
 float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    const float16_t d = bl.block.d;
@@ -60,10 +99,27 @@ float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ4_1_v(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ4_1_packed32 bl32 = decodeBufQ4_1_packed32(bl);
+    const float16_t d = bl.block.d;
+    const float16_t m = bl.block.m;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_w  = (idx & 0xC) >> 2;      // iqs / 4 in [0,4)
+    const uint qsw   = uint32_t(bl32.block.qs[qs_w]);
+    const u8vec4 q   = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+    return f16vec4(vec4(q) * vec4(float(d)) + vec4(float(m)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 {
   block_q5_0 block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0_packed16 {
+   block_q5_0_packed16 block;
+};
+
 float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    const float16_t d = bl.block.d;
@@ -82,10 +138,32 @@ float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ5_0_v(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ5_0_packed16 bl16 = decodeBufQ5_0_packed16(bl);
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_i  = (idx & 0xC) >> 1;      // packed16 word index, in {0,2,4,6}
+    const uint qsw = uint32_t(bl16.block.qs[qs_i    ])
+                   | (uint32_t(bl16.block.qs[qs_i + 1u]) << 16);
+    const u8vec4 ql = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+
+    const uint uint_qh = uint(bl16.block.qh[1]) << 16 | uint(bl16.block.qh[0]);
+    const uint qh_pack = uint_qh >> idx;      // bits 0..3 = element idx..idx+3 high bits
+    const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u;
+
+    return f16vec4((vec4(ql) + vec4(qh_high) - vec4(16.0)) * vec4(float(d)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 {
   block_q5_1 block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1_packed32 {
+   block_q5_1_packed32 block;
+};
+
 float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    const float16_t d = bl.block.d;
@@ -105,6 +183,23 @@ float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ5_1_v(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ5_1_packed32 bl32 = decodeBufQ5_1_packed32(bl);
+    const float16_t d = bl.block.d;
+    const float16_t m = bl.block.m;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_w  = (idx & 0xC) >> 2;      // iqs / 4 in [0,4)
+    const uint qsw   = uint32_t(bl32.block.qs[qs_w]);
+    const u8vec4 ql  = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+
+    const uint qh_pack = bl.block.qh >> idx;  // bits 0..3 = element idx..idx+3 high bits
+    const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u;
+
+    return f16vec4((vec4(ql) + vec4(qh_high)) * vec4(float(d)) + vec4(float(m)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 {
   block_q8_0_packed16 block;
 };
@@ -121,6 +216,17 @@ float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ8_0_v(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint base = idx >> 1u;
+    const uint w =  uint(uint16_t(bl.block.qs[base]))
+                 | (uint(uint16_t(bl.block.qs[base + 1u])) << 16u);
+    const i8vec4 qi = unpack8(int32_t(w));
+    return f16vec4(vec4(qi) * vec4(float(d)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K {
   block_q2_K block;
 };
@@ -129,6 +235,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2
   block_q2_K_packed16 block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K_packed32 {
+   block_q2_K_packed32 block;
+};
+
 float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
@@ -147,10 +257,36 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ2_K_v(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ2_K_packed32 bl32 = decodeBufQ2_K_packed32(bl);
+    const f16vec2 dm = bl.block.dm;
+    const uint idx = coordInBlock[1];
+
+    const uint scalesi = idx >> 4;                      // 0..15
+    const uint qsshift = (idx & 0x60) >> 4;             // 0,2,4,6
+
+    // qs_i (packed16) = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1) is even for idx % 4 == 0,
+    // so qs_w (packed32) = qs_i / 2 = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2).
+    const uint qs_w   = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2);
+    const uint qsw    = uint32_t(bl32.block.qs[qs_w]);
+    const uint qs4    = (qsw >> qsshift) & 0x03030303u;
+    const u8vec4 qi   = unpack8(qs4);
+
+    const uint scales      = bl.block.scales[scalesi];
+    const float16_t d_sub  = dm.x * float16_t(scales & 0xF);
+    const float16_t m_sub  = dm.y * float16_t(scales >> 4);
+    return f16vec4(vec4(qi) * vec4(float(d_sub)) - vec4(float(m_sub)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K {
   block_q3_K block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K_packed16 {
+   block_q3_K_packed16 block;
+};
+
 float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    const uint idx = coordInBlock[1];
@@ -179,6 +315,47 @@ float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ3_K_v(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ3_K_packed16 bl16 = decodeBufQ3_K_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint n         = idx >> 7;             // 0,1
+    const uint is        = idx >> 4;             // 0..15
+    const uint halfsplit = (idx & 0x60) >> 5;    // 0,1,2,3
+    const uint qsshift   = halfsplit << 1;       // 0,2,4,6
+    const uint hbit      = (n << 2) + halfsplit; // 0..7   (bit position in hmask byte)
+
+    uint32_t scaleidx0      = (is < 8) ? is : (is - 8);
+    uint32_t scaleidx0shift = (is < 8) ? 0u : 4u;
+    uint32_t scaleidx1      = is + 8 - (is / 4) * 4;
+    uint32_t scaleidx1shift = (is / 4) * 2;
+
+    const int8_t us = int8_t(
+        ((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) |
+        (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4));
+    const float16_t dl = bl.block.d * float16_t(int(us) - 32);
+
+    // For idx % 4 == 0: (idx & 0x1F) == (idx & 0x1C) is a multiple of 4.
+    const uint qsi = (n << 5) + (idx & 0x1Cu);
+    const uint hmi =             (idx & 0x1Cu);
+
+    // Two adjacent uint16 packed16 reads, combined into a uint32 in registers.
+    // After this: byte j of qsw / hmw holds the data for element idx+j.
+    const uint qsw = uint32_t(bl16.block.qs[qsi >> 1])
+                   | (uint32_t(bl16.block.qs[(qsi >> 1) + 1u]) << 16);
+    const uint hmw = uint32_t(bl16.block.hmask[hmi >> 1])
+                   | (uint32_t(bl16.block.hmask[(hmi >> 1) + 1u]) << 16);
+
+    // qsshift in {0,2,4,6} and hbit in {0..7}: per-byte masks isolate the wanted bits
+    // with no inter-byte leakage.
+    const uint ql4 = (qsw >> qsshift) & 0x03030303u;
+    const uint qh4 = (hmw >> hbit)    & 0x01010101u;
+
+    const ivec4 q = ivec4(unpack8(ql4 | (qh4 << 2))) - ivec4(4);
+    return f16vec4(vec4(q) * vec4(float(dl)));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K {
   block_q4_K block;
 };
@@ -187,6 +364,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
   block_q4_K_packed16 block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed32 {
+   block_q4_K_packed32 block;
+};
+
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128 {
   block_q4_K_packed128 block;
 };
@@ -334,6 +515,55 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
    return float16_t(ret);
 }

+f16vec4 dequantFuncQ4_K_v(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ4_K_packed32 bl32 = decodeBufQ4_K_packed32(bl);
+    decodeBufQ4_K_packed128 bl128 = decodeBufQ4_K_packed128(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint is = idx >> 5;                    // 0..7
+
+#if defined(IS_MUL_MM2) && defined(DATA_A_Q4_K)
+    vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)];
+    float d = v.x;
+    float m = v.y;
+#else
+    uvec4 v = bl128.block.q4k[0];
+    const vec2 loadd = vec2(unpackFloat2x16(v.x));
+
+    uint32_t sc;
+    uint32_t mbyte;
+
+    uint32_t scale0 = v.y;
+    uint32_t scale4 = v.z;
+    uint32_t scale8 = v.w;
+
+    uint32_t sc_lo = scale0;
+    uint32_t mb_lo = scale4;
+    uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
+    uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
+
+    sc = is < 4 ? sc_lo : sc_hi;
+    mbyte = is < 4 ? mb_lo : mb_hi;
+    sc = sc >> (8 * (is & 3));
+    mbyte = mbyte >> (8 * (is & 3));
+    sc &= 0x3F;
+    mbyte &= 0x3F;
+
+    const float d = loadd.x * float(sc);
+    const float m = loadd.y * float(mbyte);
+#endif
+
+    // idx in [0,256); vector decode uses idx a multiple of 4. packed32 word index:
+    // (qs_i >> 1) == (idx >> 6) * 8 + ((idx & 0x1E) >> 2). sh is 0 or 4 only, so a
+    // single (w >> sh) & 0x0F0F0F0F isolates all four nibbles without inter-byte leakage.
+    const uint sh = (idx & 0x20u) >> 3u;
+    const uint w = uint32_t(bl32.block.qs[(idx >> 6) * 8u + ((idx & 0x1Eu) >> 2)]);
+    const u8vec4 q = unpack8((w >> sh) & 0x0F0F0F0Fu);
+
+    return f16vec4(vec4(d) * vec4(q) - vec4(m));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K {
   block_q5_K block;
 };
@@ -346,6 +576,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
   block_q5_K_packed128 block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed32 {
+   block_q5_K_packed32 block;
+};
+
 float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
@@ -399,6 +633,58 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
    return float16_t(ret);
 }

+f16vec4 dequantFuncQ5_K_v(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ5_K_packed32 bl32 = decodeBufQ5_K_packed32(bl);
+    decodeBufQ5_K_packed128 bl128 = decodeBufQ5_K_packed128(bl);
+    const uint idx = coordInBlock[1];
+    const uint is = idx >> 5;
+
+#if defined(IS_MUL_MM2) && defined(DATA_A_Q5_K)
+    vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)];
+    float d = v.x;
+    float m = v.y;
+#else
+    uvec4 v = bl128.block.q5k[0];
+
+    const f16vec2 loadd = unpackFloat2x16(v.x);
+
+    uint32_t sc;
+    uint32_t mbyte;
+
+    uint32_t scale0 = v.y;
+    uint32_t scale4 = v.z;
+    uint32_t scale8 = v.w;
+
+    uint32_t sc_lo = scale0;
+    uint32_t mb_lo = scale4;
+    uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
+    uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
+
+    sc = is < 4 ? sc_lo : sc_hi;
+    mbyte = is < 4 ? mb_lo : mb_hi;
+    sc = sc >> (8 * (is & 3));
+    mbyte = mbyte >> (8 * (is & 3));
+    sc &= 0x3F;
+    mbyte &= 0x3F;
+
+    const float16_t d = loadd.x * float16_t(sc);
+    const float16_t m = loadd.y * float16_t(mbyte);
+#endif
+
+    // sh is 0 or 4; mask 0x0F0F0F0F covers the four nibbles regardless (no inter-byte leakage).
+    const uint sh = (idx & 0x20u) >> 3u;
+    const uint qs_w = (idx >> 6) * 8u + ((idx & 0x1Eu) >> 2);
+    const uint qh_w = (idx & 0x1Eu) >> 2;
+
+    const uint ql4 = (uint32_t(bl32.block.qs[qs_w]) >> sh) & 0x0F0F0F0Fu;
+    // qh stores bit `is` per element across 4 consecutive bytes; one shift+mask handles all 4.
+    const uint qh4 = ((uint32_t(bl32.block.qh[qh_w]) >> is) & 0x01010101u) << 4u;
+
+    const u8vec4 qi = unpack8(ql4 | qh4);
+    return f16vec4(vec4(qi) * vec4(d) - vec4(m));
+}
+
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K {
   block_q6_K block;
 };
@@ -431,6 +717,35 @@ float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2
    return ret;
 }

+f16vec4 dequantFuncQ6_K_v(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint b = (idx & 0x40) >> 6;
+    const uint qhshift = (idx & 0x60) >> 4;          // 0,2,4,6
+    const uint is = idx >> 4;
+    const uint sh = b * 4;                            // 0 or 4
+
+    const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
+
+    const uint ql_i = ((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1);
+    const uint qh_i = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1);
+
+    // Two adjacent uint16 packed16 reads, combined into a uint32 in registers.
+    // After this: byte j of qlw / qhw holds the data for element idx+j.
+    const uint qlw = uint32_t(bl16.block.ql[ql_i    ]) | (uint32_t(bl16.block.ql[ql_i + 1]) << 16);
+    const uint qhw = uint32_t(bl16.block.qh[qh_i    ]) | (uint32_t(bl16.block.qh[qh_i + 1]) << 16);
+
+    // sh in {0,4} and qhshift in {0,2,4,6}: per-byte masks 0x0F / 0x03 keep only the
+    // wanted bits with no inter-byte leakage; place qh's 2 bits at nibble high position.
+    const uint ql4 = (qlw >> sh) & 0x0F0F0F0Fu;
+    const uint qh4 = ((qhw >> qhshift) & 0x03030303u) << 4u;
+
+    const ivec4 qi = ivec4(unpack8(ql4 | qh4));
+    return f16vec4((vec4(qi) - vec4(32.0f)) * vec4(float(dscale)));
+}
+
 #if defined(DATA_A_IQ1_S)
 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ1_S {
   block_iq1_s block;
@@ -453,6 +768,29 @@ float16_t dequantFuncIQ1_S(const in decodeBufIQ1_S bl, const in uint blockCoords
    float16_t ret = float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * int(idx % 8), 2)) + float16_t(delta));
    return ret;
 }
+
+f16vec4 dequantFuncIQ1_S_v(const in decodeBufIQ1_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint ib32 = idx >> 5;
+    const uint ib8  = idx >> 3;
+    const int  i8b  = int(idx & 4);              // 0 or 4
+
+    const uint qh = bl.block.qh[ib32];
+    const uint qs = bl.block.qs[ib8];
+    const float dl    = float(d) * float(2 * bitfieldExtract(qh, 12, 3) + 1);
+    const float delta = ((qh & 0x8000u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;
+    const uint  grid  = iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)];
+
+    const ivec4 q = ivec4(
+        bitfieldExtract(int(grid), 2 * (i8b + 0), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 1), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 2), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 3), 2));
+    return f16vec4((vec4(q) + vec4(delta)) * dl);
+}
 #endif

 #if defined(DATA_A_IQ1_M)
@@ -485,6 +823,33 @@ float16_t dequantFuncIQ1_M(const in decodeBufIQ1_M bl, const in uint blockCoords
    float16_t ret = d * float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * i8, 2)) + float16_t(delta));
    return ret;
 }
+
+f16vec4 dequantFuncIQ1_M_v(const in decodeBufIQ1_M bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ1_M_packed64 bl64 = decodeBufIQ1_M_packed64(bl);
+    const uint idx = coordInBlock[1];
+
+    uvec2 scales = unpack32(bl64.block.scales);
+    const float16_t d = uint16BitsToHalf(uint16_t(((scales.x & 0xF000) >> 12) | ((scales.x & 0xF0000000) >> 24) | ((scales.y & 0xF000) >> 4) | ((scales.y & 0xF0000000) >> 16)));
+
+    const uint ib8  = idx >> 3;
+    const uint ib16 = idx >> 4;
+    const int  i8b  = int(idx & 4);   // 0 or 4 -- i8 base for the V=4 group
+
+    const uint sc = bl.block.scales[ib8 / 8];
+    const uint qs = bl.block.qs[ib8];
+    const uint qh = bl.block.qh[ib16] >> (4 * (ib8 & 1));
+    const float dl    = 2.0 * float(bitfieldExtract(sc, 3 * int(ib16 & 3), 3)) + 1.0;
+    const float delta = ((qh & 8u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;
+    const uint  grid  = iq1s_grid[qs | ((qh & 7u) << 8)];
+
+    const ivec4 q = ivec4(
+        bitfieldExtract(int(grid), 2 * (i8b + 0), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 1), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 2), 2),
+        bitfieldExtract(int(grid), 2 * (i8b + 3), 2));
+    return f16vec4((vec4(q) + vec4(delta)) * (float(d) * dl));
+}
 #endif

 #if defined(DATA_A_IQ2_XXS)
@@ -520,6 +885,33 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
    vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
    return float16_t(ret[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ2_XXS_v(const in decodeBufIQ2_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ2_XXS_packed16 bl16 = decodeBufIQ2_XXS_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint ib32 = idx >> 5;
+    const uint ib8  = (idx & 0x18) >> 3;
+    const uint iqs  = 8 * ib32 + ib8;
+
+    const uint qs        = bl.block.qs[iqs];
+    const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));
+    const float dscale   = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));
+
+    uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7);
+    sign |= bitCount(sign) << 7;
+    const uint sb = sign >> (idx & 7u);
+
+    const uint   g2 = iq2xxs_grid[qs][(idx & 4) >> 2];
+    const u8vec4 g  = unpack8(g2);
+
+    return f16vec4(
+        dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif

 #if defined(DATA_A_IQ2_XS)
@@ -548,6 +940,31 @@ float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoor
    vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
    return float16_t(ret[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ2_XS_v(const in decodeBufIQ2_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint idx = coordInBlock[1];
+
+    const uint is     = idx >> 5;
+    const uint sshift = (idx & 0x10) >> 2;
+    const uint iqs    = idx >> 3;
+
+    const uint16_t qs     = bl.block.qs[iqs];
+    const float    dscale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[is] >> sshift) & 0xF));
+
+    uint sign = uint(qs >> 9);
+    sign |= bitCount(sign) << 7;
+    const uint sb = sign >> (idx & 7u);
+
+    const uint   g2 = iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2];
+    const u8vec4 g  = unpack8(g2);
+
+    return f16vec4(
+        dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif

 #if defined(DATA_A_IQ2_S)
@@ -576,6 +993,32 @@ float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords
    const vec2 v = db * vec2(sign01) * vec2(unpack8(g2));
    return float16_t(v[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ2_S_v(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint idx = coordInBlock[1];
+
+    const uint ib32    = idx >> 5;
+    const uint ib8     = idx >> 3;
+    const uint qhshift = 2 * (ib8 % 4);
+
+    const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 2)) & 0xf;
+    const uint qs    = bl.block.qs[ib8];
+    const uint qh    = bl.block.qh[ib32];
+    const uint sb    = uint(bl.block.qs[QUANT_K / 8 + ib8]) >> (idx & 0x6u);
+
+    const float d  = float(bl.block.d);
+    const float db = d * 0.25 * (0.5 + scale);
+
+    const uint   g2 = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2];
+    const u8vec4 g  = unpack8(g2);
+
+    return f16vec4(
+        db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif

 #if defined(DATA_A_IQ3_XXS)
@@ -609,6 +1052,32 @@ float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCo
    const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
    return float16_t(v[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ3_XXS_v(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint iqs = idx >> 2;
+    const uint is  = QUANT_K / 4 + ((idx & 0xE0) >> 3);
+
+    const float d     = float(bl.block.d);
+    const uint  qs    = bl.block.qs[iqs];
+    const uint  signs = pack32(u16vec2(bl16.block.qs[is/2+0], bl16.block.qs[is/2+1]));
+    const float db    = d * 0.5 * (0.5 + (signs >> 28));
+
+    const uint sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
+    const uint sb    = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6u);
+
+    const uint   grid = iq3xxs_grid[qs];
+    const u8vec4 g    = unpack8(grid);
+
+    return f16vec4(
+        db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif

 #if defined(DATA_A_IQ3_S)
@@ -635,6 +1104,30 @@ float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords

    return float16_t(v[idx & 1]);
 }
+
+f16vec4 dequantFuncIQ3_S_v(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint idx = coordInBlock[1];
+
+    const uint iqs = idx >> 2;
+    const uint iqh = idx >> 5;
+
+    const float d     = float(bl.block.d);
+    const uint  qs    = bl.block.qs[iqs];
+    const uint  qh    = bl.block.qh[iqh];
+    const uint  sb    = uint(bl.block.signs[iqs / 2]) >> (idx & 0x6u);
+    const uint  scale = bl.block.scales[iqs / 16];
+    const float db    = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
+
+    const uint   grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
+    const u8vec4 g    = unpack8(grid);
+
+    return f16vec4(
+        db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
+        db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
+        db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
+        db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
+}
 #endif

 #if defined(DATA_A_IQ4_XS)
@@ -642,6 +1135,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
   block_iq4_xs block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufIQ4_XS_packed32 {
+   block_iq4_xs_packed32 block;
+};
+
 float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    const float16_t d = bl.block.d;
@@ -657,6 +1154,30 @@ float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoor
    float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]);
    return ret;
 }
+
+f16vec4 dequantFuncIQ4_XS_v(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ4_XS_packed32 bl32 = decodeBufIQ4_XS_packed32(bl);
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint ib32   = idx >> 5;                                   // 0..7
+    const uint sl     = (bl32.block.scales_l >> (4 * ib32)) & 0xF;
+    const uint sh     = (uint(bl32.block.scales_h) >> (2 * ib32)) & 0x3;
+    const uint qshift = (idx & 0x10) >> 2;                          // {0, 4}
+    const uint qs_w   = 4 * ib32 + ((idx & 0xC) >> 2);              // iqs / 4, in [0,32)
+
+    const float16_t dl = d * float16_t(int(sl | (sh << 4)) - 32);
+
+    const uint qsw  = bl32.block.qs[qs_w];
+    const u8vec4 qv = unpack8((qsw >> qshift) & 0x0F0F0F0Fu);
+    const vec4 ret = vec4(
+        float(kvalues_iq4nl[qv.x]),
+        float(kvalues_iq4nl[qv.y]),
+        float(kvalues_iq4nl[qv.z]),
+        float(kvalues_iq4nl[qv.w])) * float(dl);
+    return f16vec4(ret);
+}
 #endif

 #if defined(DATA_A_IQ4_NL)
@@ -664,6 +1185,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
   block_iq4_nl block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL_packed16 {
+   block_iq4_nl_packed16 block;
+};
+
 float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    const float16_t d = bl.block.d;
@@ -676,6 +1201,24 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
    float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
    return ret;
 }
+
+f16vec4 dequantFuncIQ4_NL_v(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ4_NL_packed16 bl16 = decodeBufIQ4_NL_packed16(bl);
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;     // 0 or 4
+    const uint qs_i  = (idx & 0xC) >> 1;      // packed16 word index, in {0,2,4,6}
+    const uint qsw = uint32_t(bl16.block.qs[qs_i    ])
+                   | (uint32_t(bl16.block.qs[qs_i + 1u]) << 16);
+    // shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte.
+    const u8vec4 q = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+    return f16vec4(
+        float(d) * float(kvalues_iq4nl[q.x]),
+        float(d) * float(kvalues_iq4nl[q.y]),
+        float(d) * float(kvalues_iq4nl[q.z]),
+        float(d) * float(kvalues_iq4nl[q.w]));
+}
 #endif

 #if defined(DATA_A_MXFP4)
@@ -695,6 +1238,26 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
    float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5);
    return ret;
 }
+
+f16vec4 dequantFuncMXFP4_v(const in decodeBufMXFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float d = e8m0_to_fp32(bl.block.e);
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx & 0xF;
+    const uint shift = (idx & 0x10) >> 2;
+    uvec4 qv = uvec4(
+        uint(bl.block.qs[iqs]),
+        uint(bl.block.qs[iqs + 1u]),
+        uint(bl.block.qs[iqs + 2u]),
+        uint(bl.block.qs[iqs + 3u]));
+    qv = (qv >> shift) & 0xFu;
+    const vec4 ret = vec4(
+        float(kvalues_mxfp4[qv.x]),
+        float(kvalues_mxfp4[qv.y]),
+        float(kvalues_mxfp4[qv.z]),
+        float(kvalues_mxfp4[qv.w])) * d * 0.5f;
+    return f16vec4(ret);
+}
 #endif

 #if defined(DATA_A_NVFP4)
@@ -702,6 +1265,10 @@ layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVF
   block_nvfp4 block;
 };

+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVFP4_packed32 {
+   block_nvfp4_packed32 block;
+};
+
 float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
 {
    const uint idx = coordInBlock[1];
@@ -713,56 +1280,97 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords
    qs = (qs >> shift) & 0xF;
    return float16_t(kvalues_mxfp4[qs] * d * 0.5);
 }
+
+f16vec4 dequantFuncNVFP4_v(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufNVFP4_packed32 bl32 = decodeBufNVFP4_packed32(bl);
+    const uint idx = coordInBlock[1];
+    const uint sub   = idx >> 4;
+    const uint qs_w  = ((idx & 0x30) >> 3) + ((idx & 0x4u) >> 2);  // iqs / 4, in [0,8)
+    const uint shift = (idx & 0x8) >> 1;
+    const float d    = ue4m3_to_fp32(bl.block.d[sub]);
+
+    const uint qsw  = uint32_t(bl32.block.qs[qs_w]);
+    const u8vec4 qv = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
+    const vec4 ret = vec4(
+        float(kvalues_mxfp4[qv.x]),
+        float(kvalues_mxfp4[qv.y]),
+        float(kvalues_mxfp4[qv.z]),
+        float(kvalues_mxfp4[qv.w])) * d * 0.5f;
+    return f16vec4(ret);
+}
 #endif

 #if defined(DATA_A_Q1_0)
 #define dequantFuncA dequantFuncQ1_0
+#define dequantFuncA_v dequantFuncQ1_0_v
 #elif defined(DATA_A_Q4_0)
 #define dequantFuncA dequantFuncQ4_0
+#define dequantFuncA_v dequantFuncQ4_0_v
 #elif defined(DATA_A_Q4_1)
 #define dequantFuncA dequantFuncQ4_1
+#define dequantFuncA_v dequantFuncQ4_1_v
 #elif defined(DATA_A_Q5_0)
 #define dequantFuncA dequantFuncQ5_0
+#define dequantFuncA_v dequantFuncQ5_0_v
 #elif defined(DATA_A_Q5_1)
 #define dequantFuncA dequantFuncQ5_1
+#define dequantFuncA_v dequantFuncQ5_1_v
 #elif defined(DATA_A_Q8_0)
 #define dequantFuncA dequantFuncQ8_0
+#define dequantFuncA_v dequantFuncQ8_0_v
 #elif defined(DATA_A_Q2_K)
 #define dequantFuncA dequantFuncQ2_K
+#define dequantFuncA_v dequantFuncQ2_K_v
 #elif defined(DATA_A_Q3_K)
 #define dequantFuncA dequantFuncQ3_K
+#define dequantFuncA_v dequantFuncQ3_K_v
 #elif defined(DATA_A_Q4_K)
 #define dequantFuncA dequantFuncQ4_K
+#define dequantFuncA_v dequantFuncQ4_K_v
 #define fetch_scales fetch_scalesQ4_K
 #define store_scales store_scalesQ4_K
 #elif defined(DATA_A_Q5_K)
 #define dequantFuncA dequantFuncQ5_K
+#define dequantFuncA_v dequantFuncQ5_K_v
 #define fetch_scales fetch_scalesQ5_K
 #define store_scales store_scalesQ4_K
 #elif defined(DATA_A_Q6_K)
 #define dequantFuncA dequantFuncQ6_K
+#define dequantFuncA_v dequantFuncQ6_K_v
 #elif defined(DATA_A_IQ1_S)
 #define dequantFuncA dequantFuncIQ1_S
+#define dequantFuncA_v dequantFuncIQ1_S_v
 #elif defined(DATA_A_IQ1_M)
 #define dequantFuncA dequantFuncIQ1_M
+#define dequantFuncA_v dequantFuncIQ1_M_v
 #elif defined(DATA_A_IQ2_XXS)
 #define dequantFuncA dequantFuncIQ2_XXS
+#define dequantFuncA_v dequantFuncIQ2_XXS_v
 #elif defined(DATA_A_IQ2_XS)
 #define dequantFuncA dequantFuncIQ2_XS
+#define dequantFuncA_v dequantFuncIQ2_XS_v
 #elif defined(DATA_A_IQ2_S)
 #define dequantFuncA dequantFuncIQ2_S
+#define dequantFuncA_v dequantFuncIQ2_S_v
 #elif defined(DATA_A_IQ3_XXS)
 #define dequantFuncA dequantFuncIQ3_XXS
+#define dequantFuncA_v dequantFuncIQ3_XXS_v
 #elif defined(DATA_A_IQ3_S)
 #define dequantFuncA dequantFuncIQ3_S
+#define dequantFuncA_v dequantFuncIQ3_S_v
 #elif defined(DATA_A_IQ4_XS)
 #define dequantFuncA dequantFuncIQ4_XS
+#define dequantFuncA_v dequantFuncIQ4_XS_v
 #elif defined(DATA_A_IQ4_NL)
 #define dequantFuncA dequantFuncIQ4_NL
+#define dequantFuncA_v dequantFuncIQ4_NL_v
 #elif defined(DATA_A_MXFP4)
 #define dequantFuncA dequantFuncMXFP4
+#define dequantFuncA_v dequantFuncMXFP4_v
 #elif defined(DATA_A_NVFP4)
 #define dequantFuncA dequantFuncNVFP4
+#define dequantFuncA_v dequantFuncNVFP4_v
 #elif defined(DATA_A_F32)
 #define dequantFuncA dequantFuncF32
 #endif
--- a/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp
@@ -0,0 +1,7 @@
+#version 460
+
+#extension GL_NV_cooperative_matrix_decode_vector : require
+
+void main()
+{
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -11,6 +11,9 @@
 #extension GL_KHR_memory_scope_semantics : enable
 #extension GL_KHR_cooperative_matrix : enable
 #extension GL_NV_cooperative_matrix2 : enable
+#ifdef GL_NV_cooperative_matrix_decode_vector
+#extension GL_NV_cooperative_matrix_decode_vector : enable
+#endif
 #extension GL_EXT_buffer_reference : enable
 #extension GL_KHR_shader_subgroup_ballot : enable
 #extension GL_KHR_shader_subgroup_vote : enable
@@ -54,6 +57,41 @@ float16_t faDecodeV(const decodeBufFA_V bl_in, const uint blockCoords[2], const
    }
 }

+// V=4 vector decode for K/V; dispatches to per-format _v decoders.
+f16vec4 faDecodeKVector(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
+    switch (FaTypeK) {
+        case 0u: return f16vec4(decodeBufF32(bl_in).block);
+        case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock);
+        case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock);
+        case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock);
+        case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock);
+        case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock);
+        case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock);
+        default: return f16vec4(0);
+    }
+}
+
+f16vec4 faDecodeVVector(const decodeBufFA_V bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
+    switch (FaTypeV) {
+        case 0u: return f16vec4(decodeBufF32(bl_in).block);
+        case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock);
+        case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock);
+        case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock);
+        case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock);
+        case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock);
+        case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock);
+        default: return f16vec4(0);
+    }
+}
+
+#ifdef GL_NV_cooperative_matrix_decode_vector
+#define FADECODEK , faDecodeK, faDecodeKVector
+#define FADECODEV , faDecodeV, faDecodeVVector
+#else
+#define FADECODEK , faDecodeK
+#define FADECODEV , faDecodeV
+#endif
+
 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
 layout (binding = 1) readonly buffer K {uint8_t data_k[];};
 layout (binding = 2) readonly buffer V {uint8_t data_v[];};
@@ -259,7 +297,7 @@ void main() {
        // F16: bs_k==1 (direct load). F32: bs_k==4 (vec4 / dequantFuncF32). Q4/Q8 family: bs_k==32. Q1_0: bs_k==128.
        const bool k_use_decode = (bs_k > 1u);
        if (k_use_decode) {
-            coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose, faDecodeK);
+            coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose FADECODEK);
        } else {
            coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
        }
@@ -325,7 +363,7 @@ void main() {
        uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
        const bool v_use_decode = (bs_v > 1u);
        if (v_use_decode) {
-            coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad), faDecodeV);
+            coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) FADECODEV);
        } else {
            coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
        }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
@@ -0,0 +1,69 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_shuffle : enable
+
+layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
+
+layout(constant_id = 0) const uint WARP_SIZE = 32;
+layout(constant_id = 1) const uint N = 128;
+
+layout(push_constant) uniform parameter
+{
+    uint n_rows;
+    uint src_offset;
+    uint dst_offset;
+    float scale;
+};
+
+layout(binding = 0, std430) readonly buffer A { float data_a[]; };
+layout(binding = 1, std430) writeonly buffer D { float data_d[]; };
+
+const uint EL_W = N / WARP_SIZE;
+
+void main() {
+    const uint lane = gl_SubgroupInvocationID;
+    for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID;
+            row < n_rows;
+            row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) {
+        const uint row_offset = row * N;
+
+        float reg[EL_W];
+
+        [[unroll]]
+        for (uint i = 0; i < EL_W; ++i) {
+            reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale;
+        }
+
+        [[unroll]]
+        for (uint h = 1; h < WARP_SIZE; h <<= 1) {
+            [[unroll]]
+            for (uint j = 0; j < EL_W; ++j) {
+                const float val = reg[j];
+                const float val2 = subgroupShuffleXor(val, h);
+                reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
+            }
+        }
+
+        [[unroll]]
+        for (uint h = WARP_SIZE; h < N; h <<= 1) {
+            const uint step = h / WARP_SIZE;
+            [[unroll]]
+            for (uint j = 0; j < EL_W; j += 2 * step) {
+                [[unroll]]
+                for (uint k = 0; k < step; ++k) {
+                    const float x = reg[j + k];
+                    const float y = reg[j + k + step];
+                    reg[j + k] = x + y;
+                    reg[j + k + step] = x - y;
+                }
+            }
+        }
+
+        [[unroll]]
+        for (uint i = 0; i < EL_W; ++i) {
+            data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i];
+        }
+    }
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -10,12 +10,38 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 #if !defined(DATA_A_F32) && !defined(DATA_A_F16) && !defined(DATA_A_BF16)
 #define K_PER_ITER 8
 #else
-#define K_PER_ITER 2
+#define K_PER_ITER 4
 #endif


 uint a_offset, b_offset, d_offset, y_offset;

+vec4 load_b(const uint j, const uint iybs, const uint iqs, const bool lastiter, out bool OOB_y, out bool OOB_z, out bool OOB_w) {
+    // Check if the latter elements are OOB, and don't fetch B or accumulate it.
+    OOB_y = lastiter && (iybs + iqs + y_offset >= p.ncols);
+    OOB_z = lastiter && (iybs + iqs + y_offset*2 >= p.ncols);
+    OOB_w = lastiter && (iybs + iqs + y_offset*3 >= p.ncols);
+
+    if (!OOB_w) {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*3]));
+    } else if (!OOB_z) {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]),
+                 0);
+    } else if (!OOB_y) {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
+                 0, 0);
+    } else {
+        return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
+                 0, 0, 0);
+    }
+}
+
 void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
 {
    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
@@ -25,6 +51,8 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const

 #if K_PER_ITER == 8
 #if QUANT_R == 2
+        // Note that we end up fetching bogus elements here, but its fine as they'll be
+        // within an accessible block.
        const vec4 bv02 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
        const vec4 bv13 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4]);
        const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
@@ -34,18 +62,11 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
        const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
 #endif
 #else
-        // Check if the second of the pair of elements is OOB, and don't fetch B or
-        // accumulate it. We still fetch a pair of elements for A, which is fine for
-        // quantized formats since they'll be within the same block. We should
-        // probably skip fetching the second element for F16/F32, but as of now we
-        // still do.
-        const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
+        bool OOB_y;
+        bool OOB_z;
+        bool OOB_w;

-        FLOAT_TYPE b0 = 0, b1 = 0;
-        b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
-        if (!OOB) {
-            b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
-        }
+        const vec4 b = load_b(j, iybs, iqs, lastiter, OOB_y, OOB_z, OOB_w);
 #endif
        uint ibi = first_row*p.ncols;
        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
@@ -71,22 +92,60 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const

            temp[j][n] += rowtmp;
 #else
-            const vec2 v = dequantize(ib, iqs, a_offset);
-
-            // matrix multiplication
-            temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
-            if (!OOB) {
-                temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
+            if (!OOB_w) {
+                const vec4 v = dequantize4(ib, iqs, a_offset);
+                temp[j][n] += dot(v, b);
+            } else if (!OOB_z) {
+                const vec2 v0 = dequantize(ib, iqs, a_offset);
+                const FLOAT_TYPE v1 = dequantize1(ib + 2/QUANT_R, iqs, a_offset);
+                const vec3 v = vec3(v0.x, v0.y, v1);
+                const vec3 b0 = vec3(b.x, b.y, b.z);
+                temp[j][n] += dot(v, b0);
+            } else if (!OOB_y) {
+                const vec2 v0 = dequantize(ib, iqs, a_offset);
+                const vec2 b0 = vec2(b.x, b.y);
+                temp[j][n] += dot(v0, b0);
+            } else {
+                const FLOAT_TYPE v = dequantize1(ib, iqs, a_offset);
+                temp[j][n] = fma(v, b.x, temp[j][n]);
            }
 #endif
        }
    }
 }

+#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)
+void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i)
+{
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
+        const uint iqs = 0; // quant index
+        const uint iybs = col; // y block start index
+
+        const vec4 b = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
+
+        uint ibi = first_row*p.ncols;
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib = (ibi + col)/QUANT_K; // block index
+            ibi += p.ncols;
+
+            const vec4 v = dequantize4_2aligned(ib, iqs, a_offset);
+
+            // matrix multiplication
+            temp[j][n] += dot(v, b);
+        }
+    }
+}
+#endif
+
 void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
    const uint tid = gl_LocalInvocationID.x;

    get_offsets(a_offset, b_offset, d_offset);
+    const bool is_aligned_nonquant =
+        p.batch_stride_b % 4 == 0 && b_offset % 4 == 0 &&
+        p.ncols % 4 == 0 && BLOCK_SIZE % 4 == 0 &&
+        K_PER_ITER == 4;

    y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;

@@ -105,17 +164,26 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
    int unroll_count = 4;
    uint unrolled_iters = num_iters & ~(unroll_count - 1);

-#if K_PER_ITER == 2
+    uint i = 0;
+
+#if K_PER_ITER == 4
    // If the K dimension is odd, we need lastiter==true on the last iteration
    // so OOB is computed correctly. Skip some unrolling to make that happen.
-    if ((p.ncols & 1) != 0 &&
+    if ((p.ncols & 3) != 0 &&
        unrolled_iters == num_iters &&
        unrolled_iters > 0) {
        unrolled_iters -= unroll_count;
    }
+    if (is_aligned_nonquant) {
+        while (i < unrolled_iters) {
+            // Manually partially unroll the loop
+            [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+                iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
+                i++;
+            }
+        }
+    } else {
 #endif
-
-    uint i = 0;
    while (i < unrolled_iters) {
        // Manually partially unroll the loop
        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
@@ -123,18 +191,30 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
            i++;
        }
    }
+#if K_PER_ITER == 4
+    }
+#endif

    unroll_count = 2;
    unrolled_iters = num_iters & ~(unroll_count - 1);

-#if K_PER_ITER == 2
-    if ((p.ncols & 1) != 0 &&
+#if K_PER_ITER == 4
+    if ((p.ncols & 3) != 0 &&
        unrolled_iters == num_iters &&
        unrolled_iters > 0) {
        unrolled_iters -= unroll_count;
    }
-#endif

+    if (is_aligned_nonquant) {
+        while (i < unrolled_iters && is_aligned_nonquant) {
+            // Manually partially unroll the loop
+            [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+                iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
+                i++;
+            }
+        }
+    } else {
+#endif
    while (i < unrolled_iters) {
        // Manually partially unroll the loop
        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
@@ -142,10 +222,25 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
            i++;
        }
    }
+#if K_PER_ITER == 4
+    }
+#endif
+
+#if K_PER_ITER == 4
+    if (is_aligned_nonquant) {
+        while (i < num_iters) {
+            iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
+            i++;
+        }
+    } else {
+#endif
    while (i < num_iters) {
        iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
        i++;
    }
+#if K_PER_ITER == 4
+    }
+#endif

    reduce_result(temp, d_offset, first_row, num_rows, tid);
 }
@@ -164,6 +259,6 @@ void main() {
        if (first_row >= p.stride_d) {
            return;
        }
-        compute_outputs(first_row, p.stride_d - first_row);
+        compute_outputs(first_row, min(NUM_ROWS, p.stride_d - first_row));
    }
 }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
@@ -71,10 +71,12 @@ layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

 #if QUANT_K > 1
-#define DECODEFUNCA , dequantFuncA
-
 #include "dequant_funcs_cm2.glsl"
-
+#if defined(dequantFuncA_v) && defined(GL_NV_cooperative_matrix_decode_vector)
+#define DECODEFUNCA , dequantFuncA, dequantFuncA_v
+#else
+#define DECODEFUNCA , dequantFuncA
+#endif
 #else
 #define DECODEFUNCA
 #endif
--- a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl
@@ -31,6 +31,7 @@
 #else
 #define A_TYPE float16_t
 #endif
+#define A_TYPE_PACKED32 f16vec2
 #endif

 #if defined(DATA_A_BF16)
@@ -44,6 +45,7 @@
 #else
 #define A_TYPE uint16_t
 #endif
+#define A_TYPE_PACKED32 uint32_t
 #endif

 #define QUANT_K_Q4_0 32
@@ -1722,11 +1724,18 @@ struct block_nvfp4
    uint8_t qs[QUANT_K_NVFP4 / 2];
 };

+struct block_nvfp4_packed32
+{
+    uint32_t d[QUANT_K_NVFP4 / 16 / 4];
+    uint32_t qs[QUANT_K_NVFP4 / 2 / 4];
+};
+
 #if defined(DATA_A_NVFP4)
 #define QUANT_K QUANT_K_NVFP4
 #define QUANT_R QUANT_R_NVFP4
 #define QUANT_AUXF 1
 #define A_TYPE block_nvfp4
+#define A_TYPE_PACKED32 block_nvfp4_packed32
 #endif

 #if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -798,9 +798,11 @@ void process_shaders() {

    string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});

-    string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("repeat_i32", "repeat.comp", {{"A_TYPE", "int32_t"}, {"D_TYPE", "int32_t"}});
    string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});

+    string_to_spv("repeat_i16", "repeat.comp", {{"A_TYPE", "int16_t"}, {"D_TYPE", "int16_t"}});
+
    string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});

    string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
@@ -932,6 +934,7 @@ void process_shaders() {

    string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
    string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("fwht_f32", "fwht.comp", {});
    string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
    string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
@@ -984,8 +987,16 @@ void process_shaders() {
                string_to_spv(name + (unroll ? "_unroll" : ""), "conv2d_mm.comp", defines);
 #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
                if (unroll) {
-                    defines["COOPMAT2"] = "1";
-                    string_to_spv(name, "conv2d_mm.comp", defines, true, false, true);
+                    auto cm2_defines = defines;
+                    cm2_defines["COOPMAT2"] = "1";
+                    string_to_spv(name, "conv2d_mm.comp", cm2_defines, true, false, true);
+                }
+#endif
+#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+                if (unroll) {
+                    auto cm1_defines = defines;
+                    cm1_defines["COOPMAT"] = "1";
+                    string_to_spv(name, "conv2d_mm.comp", cm1_defines, true, true, false);
                }
 #endif
            }
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -52,7 +52,7 @@
 #define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4
 #define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG      4

-// default size for legacy matrix multiplication
+// default size for reg-tile matrix multiplication
 #define WEBGPU_MUL_MAT_WG_SIZE 256

 // Same hash combine function as in boost
@@ -93,6 +93,8 @@ struct ggml_webgpu_shader_lib_context {
    uint32_t sg_mat_k                 = 0;
    uint32_t min_subgroup_size        = 0;
    uint32_t max_subgroup_size        = 0;
+    bool     supports_dot_product     = false;
+    std::string vendor;
 };

 struct webgpu_pipeline {
@@ -850,31 +852,15 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(

 /** Matrix Multiplication **/

-struct ggml_webgpu_legacy_mul_mat_pipeline_key {
-    ggml_type src0_type;
-    ggml_type src1_type;
-
-    bool operator==(const ggml_webgpu_legacy_mul_mat_pipeline_key & other) const {
-        return src0_type == other.src0_type && src1_type == other.src1_type;
-    }
-};
-
-struct ggml_webgpu_legacy_mul_mat_pipeline_key_hash {
-    size_t operator()(const ggml_webgpu_legacy_mul_mat_pipeline_key & key) const {
-        size_t seed = 0;
-        ggml_webgpu_hash_combine(seed, key.src0_type);
-        ggml_webgpu_hash_combine(seed, key.src1_type);
-        return seed;
-    }
-};
-
 struct ggml_webgpu_mul_mat_vec_pipeline_key {
    ggml_type src0_type;
    ggml_type src1_type;
    int       vectorized;
+    bool      use_mmvq;

    bool operator==(const ggml_webgpu_mul_mat_vec_pipeline_key & other) const {
-        return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized;
+        return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized &&
+               use_mmvq == other.use_mmvq;
    }
 };

@@ -884,6 +870,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {
        ggml_webgpu_hash_combine(seed, key.src0_type);
        ggml_webgpu_hash_combine(seed, key.src1_type);
        ggml_webgpu_hash_combine(seed, key.vectorized);
+        ggml_webgpu_hash_combine(seed, key.use_mmvq);
        return seed;
    }
 };
@@ -894,6 +881,20 @@ struct ggml_webgpu_mul_mat_vec_shader_decisions {
    uint32_t vec_size;
 };

+struct ggml_webgpu_quantize_q8_pipeline_key {
+    ggml_type src0_type;
+
+    bool operator==(const ggml_webgpu_quantize_q8_pipeline_key & other) const { return src0_type == other.src0_type; }
+};
+
+struct ggml_webgpu_quantize_q8_pipeline_key_hash {
+    size_t operator()(const ggml_webgpu_quantize_q8_pipeline_key & key) const {
+        size_t seed = 0;
+        ggml_webgpu_hash_combine(seed, key.src0_type);
+        return seed;
+    }
+};
+
 struct ggml_webgpu_mul_mat_pipeline_key {
    ggml_type src0_type;
    ggml_type src1_type;
@@ -1051,6 +1052,36 @@ struct ggml_webgpu_soft_max_pipeline_key_hash {
    }
 };

+/** MMVQ **/
+
+inline bool ggml_webgpu_can_use_mmvq(const ggml_tensor * src0,
+                                     const ggml_tensor * src1,
+                                     bool                supports_dot_product,
+                                     const std::string & vendor) {
+    if (src1->ne[1] == 1) {
+        bool supports_dp4a = vendor == "amd" || vendor == "intel" || vendor == "nvidia";
+        if (supports_dp4a && supports_dot_product) {
+            switch (src1->type) {
+                case GGML_TYPE_F32:
+                    switch (src0->type) {
+                        case GGML_TYPE_Q4_0:
+                        case GGML_TYPE_Q4_1:
+                        case GGML_TYPE_Q8_0:
+                        case GGML_TYPE_Q2_K:
+                        case GGML_TYPE_Q4_K:
+                            return src0->ne[0] % 4 == 0;
+                        default:
+                            break;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    return false;
+}
+
 class ggml_webgpu_shader_lib {
    wgpu::Device           device;
    pre_wgsl::Preprocessor preprocessor;
@@ -1099,14 +1130,12 @@ class ggml_webgpu_shader_lib {
                       webgpu_pipeline,
                       ggml_webgpu_flash_attn_blk_pipeline_key_hash>
        flash_attn_blk_pipelines;
-    std::unordered_map<ggml_webgpu_legacy_mul_mat_pipeline_key,
-                       webgpu_pipeline,
-                       ggml_webgpu_legacy_mul_mat_pipeline_key_hash>
-        mul_mat_legacy_pipelines;  // legacy mul_mat (non-subgroup/non-regtile/non-vec)
    std::unordered_map<ggml_webgpu_mul_mat_vec_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_vec_pipeline_key_hash>
        mul_mat_vec_pipelines;     // fast mat-vec (n==1)
    std::unordered_map<ggml_webgpu_mul_mat_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_pipeline_key_hash>
                                             mul_mat_fast_pipelines;       // fast mat-mat (reg-tile or subgroup)
+    std::unordered_map<ggml_webgpu_quantize_q8_pipeline_key, webgpu_pipeline, ggml_webgpu_quantize_q8_pipeline_key_hash>
+                                             quantize_q8_pipelines;
    std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines;  // key is fixed
    std::unordered_map<ggml_webgpu_mul_mat_id_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_id_pipeline_key_hash>
        mul_mat_id_pipelines;                                              // src0_type/src1_type
@@ -1631,7 +1660,7 @@ class ggml_webgpu_shader_lib {
        key.type                              = context.dst->type;
        key.d_state                           = (int) context.src0->ne[0];
        key.xbc_overlap                       = ggml_webgpu_tensor_overlap(context.src1, context.src4) &&
-                          ggml_webgpu_tensor_overlap(context.src1, context.src5);
+                                                ggml_webgpu_tensor_overlap(context.src1, context.src5);

        auto it = ssm_scan_pipelines.find(key);
        if (it != ssm_scan_pipelines.end()) {
@@ -1744,6 +1773,44 @@ class ggml_webgpu_shader_lib {
        return pad_pipelines[key];
    }

+    webgpu_pipeline get_quantize_q8_pipeline(const ggml_webgpu_shader_lib_context & context) {
+        ggml_webgpu_quantize_q8_pipeline_key key = {};
+        key.src0_type                            = context.src0->type;
+
+        auto it = quantize_q8_pipelines.find(key);
+        if (it != quantize_q8_pipelines.end()) {
+            return it->second;
+        }
+        const char *             shader_src = wgsl_quantize_q8;
+        std::vector<std::string> defines;
+        std::string              variant = "quantize_q8";
+
+        uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
+
+        defines.push_back("SRC1_INNER_TYPE=f32");
+        defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
+
+        const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
+        std::string                     src0_name   = src0_traits->type_name;
+        std::string                     type_upper  = src0_name;
+        variant += "_" + src0_name;
+        std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
+
+        defines.push_back("MUL_ACC_" + type_upper);
+        defines.push_back("Q8_1_T");
+
+        defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
+        variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
+
+        auto processed             = preprocessor.preprocess(shader_src, defines);
+        auto decisions             = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+        decisions->wg_size         = wg_size;
+        webgpu_pipeline pipeline   = ggml_webgpu_create_pipeline(device, processed, variant);
+        pipeline.context           = decisions;
+        quantize_q8_pipelines[key] = pipeline;
+        return quantize_q8_pipelines[key];
+    }
+
    webgpu_pipeline get_mul_mat_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
        ggml_webgpu_mul_mat_vec_pipeline_key key = {};
        key.src0_type                            = context.src0->type;
@@ -1752,6 +1819,8 @@ class ggml_webgpu_shader_lib {
                          (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
                                                       1 :
                                                       0;
+        key.use_mmvq                             =
+            ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor);

        auto it = mul_mat_vec_pipelines.find(key);
        if (it != mul_mat_vec_pipelines.end()) {
@@ -1788,6 +1857,19 @@ class ggml_webgpu_shader_lib {
                    defines.push_back("U32_DEQUANT_HELPERS");
                    defines.push_back("SRC0_INNER_TYPE=u32");
                    switch (context.src0->type) {
+                        case GGML_TYPE_Q8_0:
+                        case GGML_TYPE_Q4_0:
+                        case GGML_TYPE_Q4_1:
+                            if (key.use_mmvq) {
+                                defines.push_back("LEGACY_QUANTS");
+                            }
+                            break;
+                        case GGML_TYPE_Q2_K:
+                        case GGML_TYPE_Q4_K:
+                            if (key.use_mmvq) {
+                                defines.push_back("K_QUANTS");
+                            }
+                            break;
                        case GGML_TYPE_IQ1_S:
                        case GGML_TYPE_IQ1_M:
                        case GGML_TYPE_IQ2_S:
@@ -1840,6 +1922,11 @@ class ggml_webgpu_shader_lib {
            outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
        }

+        if (key.use_mmvq) {
+            defines.push_back("MMVQ");
+            defines.push_back("Q8_1_T");
+        }
+
        defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
        defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
        defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
@@ -2018,100 +2105,6 @@ class ggml_webgpu_shader_lib {
        return mul_mat_fast_pipelines[key];
    }

-    webgpu_pipeline get_mul_mat_legacy_pipeline(const ggml_webgpu_shader_lib_context & context) {
-        ggml_webgpu_legacy_mul_mat_pipeline_key key = {};
-        key.src0_type                               = context.src0->type;
-        key.src1_type                               = context.src1->type;
-
-        auto it = mul_mat_legacy_pipelines.find(key);
-        if (it != mul_mat_legacy_pipelines.end()) {
-            return it->second;
-        }
-
-        std::vector<std::string> defines;
-        std::string              variant = "mul_mat";
-
-        switch (context.src1->type) {
-            case GGML_TYPE_F32:
-                defines.push_back("SRC1_TYPE=f32");
-                variant += "_f32";
-                break;
-            case GGML_TYPE_F16:
-                defines.push_back("SRC1_TYPE=f16");
-                variant += "_f16";
-                break;
-            default:
-                GGML_ABORT("Unsupported src1 type for mul_mat legacy shader");
-        }
-
-        const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
-        const char *                    src0_name   = src0_traits->type_name;
-
-        switch (context.src0->type) {
-            case GGML_TYPE_F32:
-                defines.push_back("SRC0_TYPE=f32");
-                defines.push_back("FLOAT");
-                variant += "_f32";
-                break;
-            case GGML_TYPE_F16:
-                defines.push_back("SRC0_TYPE=f16");
-                defines.push_back("FLOAT");
-                variant += "_f16";
-                break;
-            default:
-                {
-                    std::string type_upper = src0_name;
-                    std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
-
-                    switch (context.src0->type) {
-                        case GGML_TYPE_Q4_0:
-                        case GGML_TYPE_Q5_0:
-                        case GGML_TYPE_Q8_0:
-                        case GGML_TYPE_Q3_K:
-                        case GGML_TYPE_Q6_K:
-                        case GGML_TYPE_IQ2_XXS:
-                        case GGML_TYPE_IQ2_XS:
-                        case GGML_TYPE_IQ2_S:
-                        case GGML_TYPE_IQ3_XXS:
-                        case GGML_TYPE_IQ3_S:
-                        case GGML_TYPE_IQ1_S:
-                        case GGML_TYPE_IQ4_NL:
-                        case GGML_TYPE_MXFP4:
-                            {
-                                // Quantized types using u32 buffers for portability.
-                                defines.push_back("SRC0_TYPE=u32");
-                                defines.push_back("U32_DEQUANT_HELPERS");
-                                break;
-                            }
-                        default:
-                            {
-                                defines.push_back(std::string("SRC0_TYPE=") + src0_name);
-                            }
-                    }
-
-                    defines.push_back("BYTE_HELPERS");
-                    defines.push_back(type_upper + "_T");
-                    defines.push_back(type_upper);
-                    defines.push_back(type_upper + "_SCALE_MIN");
-                    defines.push_back(type_upper + "_TABLES");
-                    defines.push_back(type_upper + "_GRID");
-
-                    variant += std::string("_") + src0_name;
-                    break;
-                }
-        }
-
-        auto processed = preprocessor.preprocess(wgsl_mul_mat, defines);
-
-        auto decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
-        decisions->wg_size = WEBGPU_MUL_MAT_WG_SIZE;
-
-        webgpu_pipeline pipeline      = ggml_webgpu_create_pipeline(device, processed, variant);
-        pipeline.context              = decisions;
-        mul_mat_legacy_pipelines[key] = pipeline;
-        return mul_mat_legacy_pipelines[key];
-    }
-
    webgpu_pipeline get_mul_mat_id_gather_pipeline(const ggml_webgpu_shader_lib_context & context) {
        auto it = mul_mat_id_gather_pipelines.find(1);
        if (it != mul_mat_id_gather_pipelines.end()) {
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -94,14 +94,6 @@ static inline uint32_t ggml_webgpu_u32_from_f32(float value) {
 #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES     4
 #define WEBGPU_STORAGE_BUF_BINDING_MULT          4    // a storage buffer binding size must be a multiple of 4

-// For operations which process a row in parallel, this seems like a reasonable
-// default
-#define WEBGPU_ROW_SPLIT_WG_SIZE 64
-
-// Track https://github.com/gpuweb/gpuweb/issues/5315 for fixes to
-// implementations so this can be removed, necessary only for get_rows right now
-#define WEBGPU_MAX_WG_SIZE 288
-
 /* End Constants */

 // This is a "fake" base pointer, since WebGPU buffers do not have pointers to
@@ -181,6 +173,7 @@ struct webgpu_capabilities {
    wgpu::Limits limits;
    bool         supports_subgroups       = false;
    bool         supports_subgroup_matrix = false;
+    bool         supports_dot_product     = false;

    uint32_t sg_mat_m = 0;
    uint32_t sg_mat_n = 0;
@@ -210,6 +203,8 @@ struct webgpu_global_context_struct {
    wgpu::Buffer    memset_params_buf;
    webgpu_pipeline memset_pipeline;

+    std::string vendor;
+
    // TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050
 #ifdef GGML_WEBGPU_CPU_PROFILE
    // Profiling: labeled CPU time in ms (total)
@@ -259,6 +254,7 @@ struct webgpu_context_struct {
    wgpu::Buffer             set_rows_host_error_buf;
    wgpu::CommandEncoder     active_command_encoder;
    wgpu::ComputePassEncoder active_compute_pass;
+    bool                     batch_compute_passes = true;

    size_t memset_bytes_per_thread;

@@ -590,9 +586,18 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context &
    }
 #else
    for (size_t i = 0; i < dispatches.size(); i++) {
-        ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
-        ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
-        ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
+        if (ctx->batch_compute_passes) {
+            ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
+            ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
+            ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second,
+                                                        1);
+        } else {
+            wgpu::ComputePassEncoder pass = ctx->active_command_encoder.BeginComputePass();
+            pass.SetPipeline(dispatches[i].pipeline.pipeline);
+            pass.SetBindGroup(0, bind_groups[i]);
+            pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
+            pass.End();
+        }
    }
 #endif

@@ -618,7 +623,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx,
                                              size_t                  size) {
    std::vector<uint32_t>             params       = { (uint32_t) offset, (uint32_t) size, value };
    std::vector<wgpu::BindGroupEntry> entries      = { ggml_webgpu_make_bind_group_entry(0, buf, 0, buf.GetSize()) };
-    size_t                            bytes_per_wg = WEBGPU_MAX_WG_SIZE * ctx->capabilities.memset_bytes_per_thread;
+    size_t                            bytes_per_wg = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.memset_bytes_per_thread;
    uint32_t                          wg_x         = CEIL_DIV(size + 3, bytes_per_wg);

    ctx->queue.WriteBuffer(ctx->memset_params_buf, 0, params.data(), params.size() * sizeof(uint32_t));
@@ -736,8 +741,11 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src
        ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst),
    };

-    uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
+    uint32_t wg_x;
+    uint32_t wg_y;
+    uint32_t total_wg = CEIL_DIV(ne, decisions->wg_size);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }

 static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx,
@@ -961,9 +969,10 @@ static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,

    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());

+    uint32_t wg_x;
+    uint32_t wg_y;
    uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t wg_x     = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);

    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
@@ -1051,9 +1060,10 @@ static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx,

    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());

+    uint32_t wg_x;
+    uint32_t wg_y;
    uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t wg_x     = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);

    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
@@ -1348,7 +1358,7 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
    shader_lib_ctx.src0                           = src;
    shader_lib_ctx.src1                           = nullptr;
    shader_lib_ctx.dst                            = dst;
-    shader_lib_ctx.max_wg_size                    = WEBGPU_MAX_WG_SIZE;
+    shader_lib_ctx.max_wg_size                    = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;

    webgpu_pipeline pipeline  = ctx->shader_lib->get_get_rows_pipeline(shader_lib_ctx);
    auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
@@ -1384,6 +1394,58 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }

+static void ggml_webgpu_quantize_q8_dispatch(webgpu_context &                    ctx,
+                                             ggml_tensor *                       src0,
+                                             ggml_tensor *                       src1,
+                                             ggml_tensor *                       dst,
+                                             std::vector<webgpu_dispatch_desc> & dispatches) {
+    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
+
+    shader_lib_ctx.src0               = src0;
+    shader_lib_ctx.src1               = src1;
+    shader_lib_ctx.dst                = dst;
+    shader_lib_ctx.max_wg_size        = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
+    shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
+
+    webgpu_pipeline qq8_pipeline = ctx->shader_lib->get_quantize_q8_pipeline(shader_lib_ctx);
+
+    // quantize_q8 pipeline
+    const size_t dst_offset           = ggml_webgpu_tensor_offset(dst);
+    const size_t q8_src1_align_offset = ROUNDUP_POW2(
+        dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
+    const size_t q8_src1_binding_size =
+        ROUNDUP_POW2(src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)),
+                     WEBGPU_STORAGE_BUF_BINDING_MULT);
+
+    std::vector<uint32_t> q8_params = {
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
+        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
+        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
+        (uint32_t) src1->ne[0],
+        (uint32_t) src1->ne[2],
+        (uint32_t) src1->ne[3],
+    };
+
+    std::vector<wgpu::BindGroupEntry> q8_entries = {
+        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1),
+        ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), q8_src1_align_offset, q8_src1_binding_size)
+    };
+
+    auto q8_decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(qq8_pipeline.context.get());
+
+    uint32_t       q8_wg_size     = q8_decisions->wg_size;
+    uint32_t       q8_wg_x        = 1;
+    uint32_t       q8_wg_y        = 1;
+    const uint32_t wg_per_vec     = (src0->ne[0] / 4 + (q8_wg_size - 1)) / q8_wg_size;
+    const uint32_t q8_total_wg    = src1->ne[2] * src1->ne[3] * wg_per_vec;
+    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+    compute_2d_workgroups(q8_total_wg, max_wg_per_dim, q8_wg_x, q8_wg_y);
+
+    dispatches.push_back({
+        qq8_pipeline, std::move(q8_params), std::move(q8_entries), { q8_wg_x, q8_wg_y }
+    });
+}
+
 static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
                                             ggml_tensor *    src0,
                                             ggml_tensor *    src1,
@@ -1391,47 +1453,9 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
    // Determine if this is a mat-vec operation
    bool is_vec = (dst->ne[1] == 1);

-    // Determine if we should use fast path
-    bool use_fast = false;
-    switch (src1->type) {
-        case GGML_TYPE_F16:
-            use_fast = (src0->type == GGML_TYPE_F16);
-            break;
-        case GGML_TYPE_F32:
-            // TODO: implement better mat-mat for k-quants, mat-vec for all k-quants except q6_K
-            switch (src0->type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q6_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q1_0:
-                case GGML_TYPE_IQ1_S:
-                case GGML_TYPE_IQ1_M:
-                case GGML_TYPE_IQ2_XXS:
-                case GGML_TYPE_IQ2_XS:
-                case GGML_TYPE_IQ2_S:
-                case GGML_TYPE_IQ3_XXS:
-                case GGML_TYPE_IQ3_S:
-                case GGML_TYPE_IQ4_NL:
-                case GGML_TYPE_IQ4_XS:
-                case GGML_TYPE_MXFP4:
-                    use_fast = true;
-                    break;
-                default:
-                    break;
-            }
-            break;
-        default:
-            break;
-    }
+    // use MMVQ path for mat-vec
+    bool use_mmvq = ggml_webgpu_can_use_mmvq(src0, src1, ctx->global_ctx->capabilities.supports_dot_product,
+                                             ctx->global_ctx->vendor);

    ggml_webgpu_shader_lib_context shader_lib_ctx = {};

@@ -1446,16 +1470,20 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
    shader_lib_ctx.sg_mat_k                 = ctx->global_ctx->capabilities.sg_mat_k;
    shader_lib_ctx.min_subgroup_size        = ctx->global_ctx->capabilities.min_subgroup_size;
    shader_lib_ctx.max_subgroup_size        = ctx->global_ctx->capabilities.max_subgroup_size;
+    shader_lib_ctx.supports_dot_product     = ctx->global_ctx->capabilities.supports_dot_product;
+    shader_lib_ctx.vendor                   = ctx->global_ctx->vendor;

    // Get or create pipeline
-    webgpu_pipeline pipeline;
+    webgpu_pipeline                   pipeline;
+    std::vector<webgpu_dispatch_desc> dispatches;

-    if (use_fast && is_vec) {
+    if (is_vec) {
+        if (use_mmvq) {
+            ggml_webgpu_quantize_q8_dispatch(ctx, src0, src1, dst, dispatches);
+        }
        pipeline = ctx->shader_lib->get_mul_mat_vec_pipeline(shader_lib_ctx);
-    } else if (use_fast) {
-        pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
    } else {
-        pipeline = ctx->shader_lib->get_mul_mat_legacy_pipeline(shader_lib_ctx);
+        pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
    }

    // Build params
@@ -1479,25 +1507,31 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
    };

    // Build bind group entries
-    std::vector<wgpu::BindGroupEntry> entries = {
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
-    };
+    std::vector<wgpu::BindGroupEntry> entries = {};
+
+    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
+    if (use_mmvq) {
+        auto & mmvq_qq8_entry = dispatches[0].bind_group_entries[1];
+        entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), mmvq_qq8_entry.offset,
+                                                            mmvq_qq8_entry.size));
+    } else {
+        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
+    }
+    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));

    // Calculate workgroup dimensions
    uint32_t       wg_x           = 1;
    uint32_t       wg_y           = 1;
    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;

-    if (use_fast && is_vec) {
+    if (is_vec) {
        auto * decisions = static_cast<ggml_webgpu_mul_mat_vec_shader_decisions *>(pipeline.context.get());

        uint32_t batches       = dst->ne[2] * dst->ne[3];
        uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg);
        uint32_t total_wg      = output_groups * batches;
        compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
-    } else if (use_fast) {
+    } else {
        auto * decisions = static_cast<ggml_webgpu_mul_mat_shader_decisions *>(pipeline.context.get());

        // Fast-path tiled/subgroup calculations
@@ -1518,15 +1552,13 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
        }
        uint32_t total_wg = wg_m * wg_n * dst->ne[2] * dst->ne[3];
        compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
-
-    } else {  // legacy
-        auto *   decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
-        uint32_t wg_size   = decisions->wg_size;
-        uint32_t total_wg  = CEIL_DIV(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3], wg_size);
-        compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
    }

-    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
+    dispatches.push_back({
+        pipeline, std::move(params), std::move(entries), { wg_x, wg_y }
+    });
+
+    return ggml_backend_webgpu_build_multi(ctx, dispatches);
 }

 static webgpu_encoded_op ggml_webgpu_mul_mat_id_vec(webgpu_context & ctx,
@@ -1654,14 +1686,11 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
                                          gathered_count_ids_binding_size),
    };

-    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
-
-    const uint32_t gather_total_wg = param_n_expert;
-    const uint32_t gather_wg_x     = std::min(gather_total_wg, max_wg_per_dim);
-    const uint32_t gather_wg_y     = CEIL_DIV(gather_total_wg, gather_wg_x);
+    // n_expert is much less than maxComputeWorkgroupsPerDimension (e.g., n_exeprt=256 at Qwen3.5-35B-A3B)
+    const uint32_t gather_wg_x = param_n_expert;

    dispatches.push_back({
-        gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, gather_wg_y }
+        gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, 1 }
    });

    // params for mul_mat_id.wgsl
@@ -1713,7 +1742,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
    uint32_t max_wg_n           = CEIL_DIV(total_gathered, tile_n_s) + max_active_experts;
    uint32_t total_wg           = wg_m * max_wg_n;

-    compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);

    dispatches.push_back({
        main_pipeline, std::move(main_params), std::move(main_entries), { wg_x, wg_y }
@@ -1956,10 +1985,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
    std::vector<wgpu::BindGroupEntry> reduce_entries;
    if (use_vec_reduce) {
        const uint32_t reduce_sg_size = ctx->global_ctx->capabilities.max_subgroup_size;
-        const uint32_t reduce_wg_size =
-            std::max(reduce_sg_size, (uint32_t) std::min<uint64_t>(
-                                         (uint64_t) nwg * reduce_sg_size,
-                                         ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
+        const uint32_t reduce_wg_size = std::max(
+            reduce_sg_size,
+            (uint32_t) std::min<uint64_t>((uint64_t) nwg * reduce_sg_size,
+                                          ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
        ggml_webgpu_shader_lib_context reduce_shader_ctx = shader_lib_ctx;
        reduce_shader_ctx.max_wg_size                    = reduce_wg_size;
        reduce_pipeline = ctx->shader_lib->get_flash_attn_vec_reduce_pipeline(reduce_shader_ctx);
@@ -2736,10 +2765,12 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
        block_size,  npr,         nrows
    };

-    const uint32_t                    total_wg_init = npr * nrows;
-    const uint32_t                    max_wg    = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
-    const uint32_t                    wg_x_init = std::min(total_wg_init, max_wg);
-    const uint32_t                    wg_y_init = CEIL_DIV(total_wg_init, wg_x_init);
+    uint32_t       wg_x_init;
+    uint32_t       wg_y_init;
+    const uint32_t total_wg_init  = npr * nrows;
+    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+    compute_2d_workgroups(total_wg_init, max_wg_per_dim, wg_x_init, wg_y_init);
+
    std::vector<wgpu::BindGroupEntry> init_entries = {
        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src),
        ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), init_align_offset, init_binding_size)
@@ -2796,9 +2827,11 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
            ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(dst), align_out, size_out)
        };

+        uint32_t       wg_x_merge;
+        uint32_t       wg_y_merge;
        const uint32_t total_wg_merge = nm * nrows;
-        const uint32_t wg_x_merge     = std::min(total_wg_merge, max_wg);
-        const uint32_t wg_y_merge     = CEIL_DIV(total_wg_merge, wg_x_merge);
+        compute_2d_workgroups(total_wg_merge, max_wg_per_dim, wg_x_merge, wg_y_merge);
+
        dispatches.push_back({
            argsort_merge_pipeline, std::move(merge_params), std::move(merge_entries), { wg_x_merge, wg_y_merge }
        });
@@ -2918,9 +2951,12 @@ static webgpu_encoded_op ggml_webgpu_upscale(webgpu_context ctx, ggml_tensor * s

    webgpu_pipeline pipeline  = ctx->shader_lib->get_upscale_pipeline(shader_lib_ctx);
    auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
-    uint32_t        total_wg  = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
-    uint32_t        wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
-    uint32_t        wg_y = CEIL_DIV(total_wg, wg_x);
+
+    uint32_t wg_x;
+    uint32_t wg_y;
+    uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
+    compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
+
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }

@@ -3110,18 +3146,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
    uint32_t num_batched_kernels  = 0;
    uint32_t num_inflight_batches = 0;
    bool     contains_set_rows    = false;
-    bool     batch_compute_passes = true;
    int      num_encoded_ops      = 1;
    int      node_idx             = 0;

 #ifdef GGML_WEBGPU_GPU_PROFILE
    ctx->profile_timestamp_query_count = 0;
-    batch_compute_passes               = false;
    std::vector<std::string> profile_pipeline_names;
 #endif

    ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
-    if (batch_compute_passes) {
+    if (ctx->batch_compute_passes) {
        ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
    }

@@ -3148,7 +3182,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str

            // reset state for next batch
            ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
-            if (batch_compute_passes) {
+            if (ctx->batch_compute_passes) {
                ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
            }
            ctx->param_arena.reset();
@@ -3548,8 +3582,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                        const uint32_t kv_tile = decisions.kv_tile;

                        const uint32_t vec_nwg_cap = ctx->webgpu_global_ctx->capabilities.min_subgroup_size;
-                        uint32_t       nwg     = 1u;
-                        const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile);
+                        uint32_t       nwg         = 1u;
+                        const uint64_t kv_span     = (uint64_t) std::max(1u, kv_tile);
                        while ((2u * nwg * kv_span) < (uint64_t) K->ne[1] && nwg < vec_nwg_cap) {
                            nwg <<= 1;
                        }
@@ -3582,6 +3616,22 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                }
            }
            break;
+        case GGML_OP_MUL_MAT:
+            {
+                const ggml_tensor * src0 = tensor->src[0];
+                const ggml_tensor * src1 = tensor->src[1];
+                bool                use_mmvq =
+                    ggml_webgpu_can_use_mmvq(src0, src1, ctx->webgpu_global_ctx->capabilities.supports_dot_product,
+                                             ctx->webgpu_global_ctx->vendor);
+                if (use_mmvq) {
+                    const size_t q8_src1_size =
+                        src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32));
+                    res = ROUNDUP_POW2(res + q8_src1_size +
+                                           ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment,
+                                       WEBGPU_STORAGE_BUF_BINDING_MULT);
+                }
+            }
+            break;
        case GGML_OP_MUL_MAT_ID:
            {
                const ggml_tensor * src0 = tensor->src[0];
@@ -3658,13 +3708,13 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) {

 static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
    // we use the maximum workgroup size for the memset pipeline
-    size_t max_threads = WEBGPU_MAX_WG_SIZE * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+    size_t max_threads = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
    // Size the bytes_per_thread so that the largest buffer size can be handled
    ctx->capabilities.memset_bytes_per_thread =
        CEIL_DIV(ctx->capabilities.limits.maxStorageBufferBindingSize, max_threads);
    std::vector<wgpu::ConstantEntry> constants(2);
    constants[0].key     = "wg_size";
-    constants[0].value   = WEBGPU_MAX_WG_SIZE;
+    constants[0].value   = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
    constants[1].key     = "bytes_per_thread";
    constants[1].value   = ctx->capabilities.memset_bytes_per_thread;
    ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
@@ -3707,12 +3757,16 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
    ctx->webgpu_global_ctx->adapter.GetInfo(&info);
    ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
    ctx->webgpu_global_ctx->max_inflight_batches      = ggml_backend_webgpu_get_max_inflight_batches();
+    ctx->webgpu_global_ctx->vendor                    = info.vendor;
    wgpu::SupportedFeatures features;
    ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
    // we require f16 support
    GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
    ctx->webgpu_global_ctx->capabilities.supports_subgroups =
        ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
+    // for dot4I8packed
+    ctx->webgpu_global_ctx->capabilities.supports_dot_product = ctx->webgpu_global_ctx->instance.HasWGSLLanguageFeature(
+        wgpu::WGSLLanguageFeatureName::Packed4x8IntegerDotProduct);

    bool valid_subgroup_matrix_config = false;
 #ifndef __EMSCRIPTEN__
@@ -3839,6 +3893,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
                              wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");

 #ifdef GGML_WEBGPU_GPU_PROFILE
+    webgpu_ctx->batch_compute_passes = false;
    ggml_webgpu_create_buffer(
        webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
        wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf");
--- a/Show More
+++ b/Show More