Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-23 16:37:33 +03:00)

Compare commits: b6188 ... sync-ggml- (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | a5801f408f | |
| | 2c1f810178 | |
@@ -1,130 +0,0 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================

# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/
# If you have a tools.sh script, make sure it is copied here
# cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set CANN environment variables (required for runtime) --
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...

WORKDIR /app

# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full

COPY --from=build /app/full /app

# Install Python dependencies
RUN yum install -y git python3 python3-pip && \
    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
    pip3 install --no-cache-dir -r requirements.txt && \
    yum clean all && \
    rm -rf /var/cache/yum

# You need to provide a tools.sh script as the entrypoint
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light

COPY --from=build /app/full/llama-cli /app

ENTRYPOINT [ "/app/llama-cli" ]

### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
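The stages above map directly to docker build targets. A minimal usage sketch, assuming the file is saved as .devops/cann.Dockerfile (the path, SoC override, and image tags are illustrative, not part of the diff):

# build the CLI-only "light" image for the default Ascend910B3 SoC
docker build -f .devops/cann.Dockerfile --target light -t llama-cpp-cann:light .

# build the dedicated server image, overriding the SoC type
docker build -f .devops/cann.Dockerfile --target server \
    --build-arg ASCEND_SOC_TYPE=Ascend310P3 -t llama-cpp-cann:server .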
22 .devops/cloud-v-pipeline (Normal file)
@@ -0,0 +1,22 @@
node('x86_runner1'){                      // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
    stage('Cleanup'){
        cleanWs()                         // Cleaning previous CI build in workspace
    }
    stage('checkout repo'){
        retry(5){                         // Retry if the cloning fails due to some reason
            checkout scm                  // Clone the repo on Runner
        }
    }
    stage('Compiling llama.cpp'){
        sh'''#!/bin/bash
        make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
        '''
    }
    stage('Running llama.cpp'){
        sh'''#!/bin/bash
        module load gnu-bin2/0.1           # loading latest versions of vector qemu and vector gcc
        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
        cat llama_log.txt                  # Printing results
        '''
    }
}
@@ -4,6 +4,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

@@ -11,8 +13,10 @@ WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
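TARGETARCH is populated automatically by BuildKit from the requested platform. A hedged sketch of exercising both branches of the RUN above (platform list, tag, and ARM arch value are illustrative assumptions):

docker buildx build --platform linux/amd64,linux/arm64 \
    --build-arg GGML_CPU_ARM_ARCH=armv8.2-a -t llama-cpp:cpu .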
@@ -60,7 +60,8 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
    && pip install --break-system-packages -r requirements.txt \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.4
ARG AMDGPU_VERSION=6.4
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@@ -40,7 +40,7 @@ body:
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
      multiple: true
    validations:
      required: true
2 .github/ISSUE_TEMPLATE/011-bug-results.yml (vendored)
@@ -42,7 +42,7 @@ body:
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
      multiple: true
    validations:
      required: true
5 .github/labeler.yml (vendored)
@@ -22,11 +22,6 @@ Vulkan:
    - any-glob-to-any-file:
      - ggml/include/ggml-vulkan.h
      - ggml/src/ggml-vulkan/**
IBM zDNN:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-zdnn.h
      - ggml/src/ggml-zdnn/**
documentation:
  - changed-files:
    - any-glob-to-any-file:
43 .github/workflows/build-riscv-native.yml (vendored)
@@ -1,43 +0,0 @@
name: Build on RISCV Linux Machine by Cloud-V
on:
  workflow_dispatch:
  workflow_call:

jobs:
  bianbu-riscv64-native: # Bianbu 2.2
    runs-on: self-hosted

    steps:
      - name: Install prerequisites
        run: |
          sudo apt-get update || true
          sudo apt-get install -y libatomic1
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo apt-get update || true
          sudo apt-get install -y --no-install-recommends \
            build-essential \
            gcc-14-riscv64-linux-gnu \
            g++-14-riscv64-linux-gnu \
            cmake

      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
            -DLLAMA_BUILD_TOOLS=ON \
            -DLLAMA_BUILD_TESTS=OFF \
            -DCMAKE_SYSTEM_NAME=Linux \
            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
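The job above cross-compiles for riscv64 with gcc-14; the resulting binaries would then run under qemu user-mode emulation, much as the Cloud-V pipeline earlier in this diff does. A hedged sketch (the sysroot path mirrors CMAKE_FIND_ROOT_PATH above and is an assumption):

qemu-riscv64 -L /usr/lib/riscv64-linux-gnu ./build/bin/llama-cli --version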
128 .github/workflows/build.yml (vendored)
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
@@ -104,7 +104,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
@@ -144,7 +144,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64-webgpu
          evict-old-files: 1d
@@ -159,15 +159,31 @@ jobs:
      - name: Dawn Dependency
        id: dawn-depends
        run: |
          DAWN_VERSION="v1.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          curl -L -o artifact.tar.gz \
            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          ARTIFACTS_JSON=$(curl -s -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/google/dawn/actions/artifacts")
          echo "Finding latest macos-latest-Release artifact..."
          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
            | sort_by(.created_at)
            | reverse
            | map(select(.name | test("macos-latest-Release$")))
            | .[0].archive_download_url')
          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
            echo "No suitable Dawn artifact found!"
            exit 1
          fi
          echo "Downloading from: $DOWNLOAD_URL"
          curl -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -o artifact.zip "$DOWNLOAD_URL"
          unzip artifact.zip
          mkdir dawn
          tar -xvf artifact.tar.gz -C dawn --strip-components=1
          tar_file=$(find . -name '*.tar.gz' | head -n 1)
          echo "Extracting: $tar_file"
          tar -xvf "$tar_file" -C dawn --strip-components=1

      - name: Build
        id: cmake_build
@@ -199,7 +215,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d
@@ -251,7 +267,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
          evict-old-files: 1d
@@ -330,7 +346,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-rpc
          evict-old-files: 1d
@@ -363,7 +379,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d
@@ -400,7 +416,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-webgpu
          evict-old-files: 1d
@@ -417,15 +433,31 @@ jobs:
        id: dawn-depends
        run: |
          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
          DAWN_VERSION="v1.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          curl -L -o artifact.tar.gz \
            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          ARTIFACTS_JSON=$(curl -s -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/google/dawn/actions/artifacts")
          echo "Finding latest ubuntu-latest-Release artifact..."
          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
            | sort_by(.created_at)
            | reverse
            | map(select(.name | test("ubuntu-latest-Release$")))
            | .[0].archive_download_url')
          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
            echo "No suitable Dawn artifact found!"
            exit 1
          fi
          echo "Downloading from: $DOWNLOAD_URL"
          curl -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -o artifact.zip "$DOWNLOAD_URL"
          unzip artifact.zip
          mkdir dawn
          tar -xvf artifact.tar.gz -C dawn --strip-components=1
          tar_file=$(find . -name '*.tar.gz' | head -n 1)
          echo "Extracting: $tar_file"
          tar -xvf "$tar_file" -C dawn --strip-components=1

      - name: Build
        id: cmake_build
@@ -443,7 +475,7 @@ jobs:

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
    container: rocm/dev-ubuntu-22.04:6.1.2
    container: rocm/dev-ubuntu-22.04:6.0.2

    steps:
      - name: Clone
@@ -457,7 +489,7 @@ jobs:
          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-hip
          evict-old-files: 1d
@@ -471,6 +503,16 @@ jobs:
            -DGGML_HIP=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
          cmake -B build2 -S . \
            -DCMAKE_C_COMPILER=hipcc \
            -DCMAKE_CXX_COMPILER=hipcc \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
            -DGGML_HIP=ON
          cmake --build build2 --config Release -j $(nproc)

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
@@ -487,7 +529,7 @@ jobs:
          apt-get install -y build-essential git cmake libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-musa
          evict-old-files: 1d
@@ -532,7 +574,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-sycl
          evict-old-files: 1d
@@ -580,7 +622,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-sycl-fp16
          evict-old-files: 1d
@@ -611,7 +653,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-ios
          evict-old-files: 1d
@@ -648,7 +690,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-tvos
          evict-old-files: 1d
@@ -720,7 +762,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-swift
          evict-old-files: 1d
@@ -766,7 +808,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-msys2
          variant: ccache
@@ -834,7 +876,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.build }}
          variant: ccache
@@ -948,7 +990,7 @@ jobs:
          apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-cuda
          evict-old-files: 1d
@@ -977,7 +1019,7 @@ jobs:
        uses: actions/checkout@v4

      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -1033,7 +1075,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
@@ -1070,8 +1112,7 @@ jobs:
          write-host "Downloading AMD HIP SDK Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $proc.WaitForExit(600000)
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"

      - name: Verify ROCm
@@ -1080,7 +1121,7 @@ jobs:
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ${{ github.job }}
          evict-old-files: 1d
@@ -1114,11 +1155,6 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Xcode
        uses: maxim-lobanov/setup-xcode@v1
        with:
          xcode-version: latest-stable

      - name: Build
        id: cmake_build
        run: |
@@ -1152,7 +1188,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: android-build
          evict-old-files: 1d
53 .github/workflows/copilot-setup-steps.yml (vendored)
@@ -1,53 +0,0 @@
name: "Copilot Setup Steps"

# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
  workflow_dispatch:
  push:
    paths:
      - .github/workflows/copilot-setup-steps.yml
  pull_request:
    paths:
      - .github/workflows/copilot-setup-steps.yml

jobs:
  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
  copilot-setup-steps:
    runs-on: ubuntu-latest

    # Set the permissions to the lowest permissions possible needed for your steps.
    # Copilot will be given its own token for its operations.
    permissions:
      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
      contents: read

    # You can define any steps you want, and they will run before the agent starts.
    # If you do not check out your code, Copilot will do this for you.
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: copilot-setup-steps
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/activate
          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
          pip install flake8 pyright
45 .github/workflows/pre-tokenizer-hashes.yml (vendored)
@@ -1,45 +0,0 @@
name: Check Pre-Tokenizer Hashes

on:
  push:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'
  pull_request:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'

jobs:
  pre-tokenizer-hashes:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt

      - name: Update pre-tokenizer hashes
        run: |
          cp convert_hf_to_gguf.py /tmp
          .venv/bin/python convert_hf_to_gguf_update.py --check-missing

      - name: Check if committed pre-tokenizer hashes matches generated version
        run: |
          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
            echo "Differences found:"
            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
            exit 1
          fi
          echo "Model pre-tokenizer hashes are up to date."
27 .github/workflows/release.yml (vendored)
@@ -32,7 +32,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
@@ -85,7 +85,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
@@ -147,7 +147,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d
@@ -198,7 +198,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d
@@ -256,7 +256,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-cpu-${{ matrix.arch }}
          variant: ccache
@@ -328,7 +328,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
@@ -398,7 +398,7 @@ jobs:
        uses: actions/checkout@v4

      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -471,7 +471,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
@@ -545,7 +545,7 @@ jobs:
          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
          evict-old-files: 1d
@@ -557,8 +557,7 @@ jobs:
          write-host "Downloading AMD HIP SDK Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $proc.WaitForExit(600000)
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"

      - name: Verify ROCm
@@ -601,7 +600,7 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode-build:
    runs-on: macos-15
    runs-on: macos-latest

    steps:
      - name: Checkout code
@@ -609,10 +608,6 @@ jobs:
        with:
          fetch-depth: 0

      - name: Setup Xcode
        run: |
          sudo xcode-select -s /Applications/Xcode_16.4.app

      - name: Build
        id: cmake_build
        run: |
1 .gitignore (vendored)
@@ -82,7 +82,6 @@ models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
!models/templates

# Zig
zig-out/
@@ -12,8 +12,6 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")

# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -10,4 +10,3 @@
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-zdnn/ @taronaeo
@@ -17,8 +17,6 @@ LLM inference in C/C++

## Hot topics

- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
@@ -241,7 +239,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Infrastructure</summary>

- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
264 common/arg.cpp
@@ -24,7 +24,6 @@
#include <cstdarg>
#include <filesystem>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
@@ -749,39 +748,6 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
// utils
//

// Helper function to parse tensor buffer override strings
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        auto * dev = ggml_backend_dev_get(i);
        auto * buft = ggml_backend_dev_buffer_type(dev);
        if (buft) {
            buft_list[ggml_backend_buft_name(buft)] = buft;
        }
    }

    for (const auto & override : string_split<std::string>(value, ',')) {
        std::string::size_type pos = override.find('=');
        if (pos == std::string::npos) {
            throw std::invalid_argument("invalid value");
        }
        std::string tensor_name = override.substr(0, pos);
        std::string buffer_type = override.substr(pos + 1);

        if (buft_list.find(buffer_type) == buft_list.end()) {
            printf("Available buffer types:\n");
            for (const auto & it : buft_list) {
                printf("  %s\n", ggml_backend_buft_name(it.second));
            }
            throw std::invalid_argument("unknown buffer type");
        }
        // keep strings alive and avoid leaking memory by storing them in a static vector
        static std::list<std::string> buft_overrides;
        buft_overrides.push_back(tensor_name);
        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
    }
}

struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
@@ -1011,10 +977,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
        for (auto & pair : params.speculative.replacements) {
            string_process_escapes(pair.first);
            string_process_escapes(pair.second);
        }
    }

    if (!params.kv_overrides.empty()) {
@@ -1026,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.speculative.tensor_buft_overrides.empty()) {
        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
        throw std::runtime_error(string_format(
            "error: the supplied chat template is not supported: %s%s\n",
@@ -1238,7 +1196,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
            common_params_print_completion(ctx_arg);
            exit(0);
        }
        params.lr.init();
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
@@ -1507,14 +1464,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--swa-checkpoints"}, "N",
        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
        [](common_params & params, int value) {
            params.n_swa_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2142,13 +2091,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.no_kv_offload = true;
        }
    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
    add_opt(common_arg(
        {"-nr", "--no-repack"},
        "disable weight repacking",
        [](common_params & params) {
            params.no_extra_bufts = true;
        }
    ).set_env("LLAMA_ARG_NO_REPACK"));
    add_opt(common_arg(
        {"-ctk", "--cache-type-k"}, "TYPE",
        string_format(
@@ -2395,58 +2337,38 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type", [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
            if (buft_list.empty()) {
                // enumerate all the devices and add their buffer types to the list
                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                    auto * dev = ggml_backend_dev_get(i);
                    auto * buft = ggml_backend_dev_buffer_type(dev);
                    if (buft) {
                        buft_list[ggml_backend_buft_name(buft)] = buft;
                    }
                }
            }

            for (const auto & override : string_split<std::string>(value, ',')) {
                std::string::size_type pos = override.find('=');
                if (pos == std::string::npos) {
                    throw std::invalid_argument("invalid value");
                }
                std::string tensor_name = override.substr(0, pos);
                std::string buffer_type = override.substr(pos + 1);

                if (buft_list.find(buffer_type) == buft_list.end()) {
                    printf("Available buffer types:\n");
                    for (const auto & it : buft_list) {
                        printf("  %s\n", ggml_backend_buft_name(it.second));
                    }
                    throw std::invalid_argument("unknown buffer type");
                }
                // FIXME: this leaks memory
                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
            }
        }
    ));
    add_opt(common_arg(
        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cpu-moe", "-cmoe"},
        "keep all Mixture of Experts (MoE) weights in the CPU",
        [](common_params & params) {
            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
        }
    ).set_env("LLAMA_ARG_CPU_MOE"));
    add_opt(common_arg(
        {"--n-cpu-moe", "-ncmoe"}, "N",
        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
        [](common_params & params, int value) {
            if (value < 0) {
                throw std::invalid_argument("invalid value");
            }
            for (int i = 0; i < value; ++i) {
                // keep strings alive and avoid leaking memory by storing them in a static vector
                static std::list<std::string> buft_overrides;
                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
            }
        }
    ).set_env("LLAMA_ARG_N_CPU_MOE"));
    add_opt(common_arg(
        {"--cpu-moe-draft", "-cmoed"},
        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
        [](common_params & params) {
            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
        [](common_params & params, int value) {
            if (value < 0) {
                throw std::invalid_argument("invalid value");
            }
            for (int i = 0; i < value; ++i) {
                static std::list<std::string> buft_overrides_draft;
                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        "number of layers to store in VRAM",
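For reference, these overrides compose on the command line. A hedged usage sketch (the model path and the "CPU" buffer type name are assumptions, not part of the diff):

# keep every MoE expert tensor on the CPU while offloading the rest
llama-server -m model.gguf -ngl 99 --cpu-moe

# the explicit-pattern equivalent, using the same regex the -ncmoe option generates per layer
llama-server -m model.gguf -ngl 99 -ot "blk\.0\.ffn_(up|down|gate)_exps=CPU"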
@@ -2697,7 +2619,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2705,15 +2627,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_out_freq = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--output-format"}, "{gguf,dat}",
        string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
        [](common_params & params, const std::string & value) {
            /**/ if (value == "gguf") { params.imat_dat = -1; }
            else if (value == "dat")  { params.imat_dat =  1; }
            else { throw std::invalid_argument("invalid output format"); }
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--save-frequency"}, "N",
        string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -2989,9 +2902,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
        "- none: leaves thoughts unparsed in `message.content`\n"
        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
        "(default: auto)",
        "(default: deepseek)",
        [](common_params & params, const std::string & value) {
            params.reasoning_format = common_reasoning_format_from_name(value);
            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
@@ -3172,7 +3088,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-tbd", "--threads-batch-draft"}, "N",
        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3182,7 +3098,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-Cd", "--cpu-mask-draft"}, "M",
        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3333,13 +3249,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
    add_opt(common_arg(
        {"--spec-replace"}, "TARGET", "DRAFT",
        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
        [](common_params & params, const std::string & tgt, const std::string & dft) {
            params.speculative.replacements.push_back({ tgt, dft });
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
        string_format(
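A hedged usage sketch for the imatrix options above (model and calibration files are assumptions; the output-file flag mirrors the out_file parameter shown earlier):

llama-imatrix -m model.gguf -f calibration.txt -o imatrix.gguf \
    --output-format gguf -ofreq 10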
@@ -3529,11 +3438,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    // diffusion parameters
    add_opt(common_arg(
        { "--diffusion-steps" }, "N",
        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
        [](common_params & params, int value) { params.diffusion.steps = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-eps" }, "F",
        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-algorithm" }, "N",
        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
                      params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-alg-temp" }, "F",
        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-visual" },
        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
@@ -3541,85 +3467,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) { params.diffusion.visual_mode = true; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

    add_opt(common_arg(
        { "--diffusion-eps" }, "F",
        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-algorithm" }, "N",
        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                      params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-alg-temp" }, "F",
        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

    add_opt(common_arg(
        { "--diffusion-block-length" }, "N",
        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
        [](common_params & params, int value) { params.diffusion.block_length = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-cfg-scale" }, "F",
        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-add-gumbel-noise" }, "F",
        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

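A hedged sketch of how the diffusion options above would be invoked (the llama-diffusion-cli binary name, model file, and chosen values are assumptions):

llama-diffusion-cli -m dream-7b.gguf -p "Write a haiku" \
    --diffusion-steps 256 --diffusion-algorithm 3 --diffusion-visual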
    add_opt(
        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
            string_format(
                "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
                (double) params.lr.lr0),
            [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(
        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
            string_format(
                "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
                (double) params.lr.lr_min),
            [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(
        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
            string_format(
                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
                (double) params.lr.decay_epochs),
            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        { "-wd", "--weight-decay" }, "WD",
        string_format(
            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
            (double) params.lr.wd),
        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
        string_format("fraction of data to use as validation set for training (default: %.2g).",
            (double) params.val_split),
        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
        [](common_params & params, const std::string & name) {
            params.optimizer = common_opt_get_optimizer(name.c_str());
            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
            }
        })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));

    return ctx_arg;
}
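A hedged sketch of how the finetune options above combine (the llama-finetune binary name and file paths are assumptions):

llama-finetune -m model.gguf -f train.txt -opt sgd -lr 1e-4 \
    -wd 1e-9 -epochs 2 -val-split 0.05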
@@ -55,15 +55,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
    std::string arguments = "";
    if (tool_call.contains("arguments")) {
        if (tool_call.at("arguments").is_object()) {
            arguments = tool_call.at("arguments").dump();
        } else {
            arguments = tool_call.at("arguments");
        }
    }

    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
    return add_tool_call(name, id, arguments);
}
363 common/chat.cpp
@@ -126,8 +126,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm

typedef minja::chat_template common_chat_template;

struct common_chat_templates {
    bool add_bos;
    bool add_eos;
    bool has_explicit_template; // Model had builtin template or template override was specified.
    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
    std::unique_ptr<common_chat_template> template_tool_use;

@@ -145,8 +143,6 @@ struct templates_params {
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    json extra_context;
    bool add_bos;
    bool add_eos;
};

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {

@@ -296,7 +292,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
    }
    if (!msg.reasoning_content.empty()) {
        jmsg["reasoning_content"] = msg.reasoning_content;
        jmsg["thinking"] = msg.reasoning_content; // gpt-oss
    }
    if (!msg.tool_name.empty()) {
        jmsg["name"] = msg.tool_name;

@@ -450,8 +445,6 @@ std::string common_chat_format_single(

    common_chat_templates_inputs inputs;
    inputs.use_jinja = use_jinja;
    inputs.add_bos = tmpls->add_bos;
    inputs.add_eos = tmpls->add_eos;

    std::string fmt_past_msg;
    if (!past_msg.empty()) {

@@ -473,12 +466,9 @@ std::string common_chat_format_single(
    return ss.str();
}

std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
    common_chat_templates_inputs inputs;
    inputs.use_jinja = use_jinja;
    inputs.add_bos = tmpls->add_bos;
    inputs.add_eos = tmpls->add_eos;
    inputs.chat_template_kwargs = chat_template_kwargs;
    auto add_simple_msg = [&](auto role, auto content) {
        common_chat_msg msg;
        msg.role = role;

@@ -554,21 +544,8 @@ common_chat_templates_ptr common_chat_templates_init(
            default_template_src = CHATML_TEMPLATE_SRC;
        }
    }

    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
    if (default_template_src.find("<|channel|>") != std::string::npos
        // search for the error message and patch it
        && default_template_src.find("in message.content or") != std::string::npos) {
        string_replace_all(default_template_src,
            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
            "{%- if false %}");
    }

    std::string token_bos = bos_token_override;
    std::string token_eos = eos_token_override;
    bool add_bos = false;
    bool add_eos = false;
    if (model) {
        const auto * vocab = llama_model_get_vocab(model);
        const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {

@@ -583,13 +560,9 @@ common_chat_templates_ptr common_chat_templates_init(
        };
        token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
        token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
        add_bos = llama_vocab_get_add_bos(vocab);
        add_eos = llama_vocab_get_add_eos(vocab);
    }
    common_chat_templates_ptr tmpls(new common_chat_templates());
    tmpls->has_explicit_template = has_explicit_template;
    tmpls->add_bos = add_bos;
    tmpls->add_eos = add_eos;
    try {
        tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
    } catch (const std::exception & e) {
@@ -619,8 +592,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
        case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
        case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
        default:
            throw std::runtime_error("Unknown chat format");
    }

@@ -629,28 +600,13 @@ const char * common_chat_format_name(common_chat_format format) {
const char * common_reasoning_format_name(common_reasoning_format format) {
    switch (format) {
        case COMMON_REASONING_FORMAT_NONE: return "none";
        case COMMON_REASONING_FORMAT_AUTO: return "auto";
        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
        default:
            throw std::runtime_error("Unknown reasoning format");
    }
}

common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
    if (format == "none") {
        return COMMON_REASONING_FORMAT_NONE;
    } else if (format == "auto") {
        return COMMON_REASONING_FORMAT_AUTO;
    } else if (format == "deepseek") {
        return COMMON_REASONING_FORMAT_DEEPSEEK;
    } else if (format == "deepseek-legacy") {
        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
    }
    throw std::runtime_error("Unknown reasoning format: " + format);
}

static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
    std::string arguments;
    if (builder.is_partial()) {

@@ -792,10 +748,10 @@ static std::string apply(
    // instead of using `chat_template_options.use_bos_token = false`, since these tokens
    // may be needed inside the template / between messages too.
    auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
    if (string_starts_with(result, tmpl.bos_token())) {
        result = result.substr(tmpl.bos_token().size());
    }
    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
    if (string_ends_with(result, tmpl.eos_token())) {
        result = result.substr(0, result.size() - tmpl.eos_token().size());
    }
    return result;

@@ -1333,174 +1289,6 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
        tool_calls_end);
}
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    auto prompt = apply(tmpl, inputs);

    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_GPT_OSS;

    // These special tokens are required to parse properly, so we include them
    // even if parse_tool_calls is false.
    data.preserved_tokens = {
        "<|channel|>",
        "<|constrain|>",
        "<|message|>",
        "<|start|>",
        "<|end|>",
    };

    if (inputs.tools.is_array() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // tool calls can appear in commentary or analysis channels
            auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");

            std::vector<std::string> tool_rules_recipient_in_role;
            std::vector<std::string> tool_rules_recipient_in_channel;
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                std::string name = function.at("name");
                auto parameters = function.at("parameters");
                builder.resolve_refs(parameters);

                tool_rules_recipient_in_role.push_back(
                    builder.add_rule(name + "-call",
                        "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
                        builder.add_schema(name + "-args", parameters)
                    )
                );

                tool_rules_recipient_in_channel.push_back(
                    builder.add_rule(name + "-call",
                        "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
                        builder.add_schema(name + "-args", parameters)
                    )
                );
            });

            auto recipient_in_role = builder.add_rule("recipient_in_role",
                "\"<|start|>assistant\"? \" to=functions.\" ( " +
                string_join(tool_rules_recipient_in_role, " | ") + " )"
            );

            auto recipient_in_channel = builder.add_rule("recipient_in_channel",
                channel + " \" to=functions.\" ( " +
                string_join(tool_rules_recipient_in_channel, " | ") + " )"
            );

            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);

            // Trigger on tool calls that appear in the commentary channel
            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
                "<\\|channel\\|>(commentary|analysis) to"
            });

            // Trigger tool calls that appear in the role section, either at the
            // start or in the middle.
            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
                "^ to"
            });

            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
                "<\\|start\\|>assistant to"
            });
        });
    }

    return data;
}
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");

    static const common_regex start_regex("<\\|start\\|>assistant");
    static const common_regex analysis_regex("<\\|channel\\|>analysis");
    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
    static const common_regex preamble_regex("<\\|channel\\|>commentary");
    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");

    auto consume_end = [&](bool include_end = false) {
        if (auto res = builder.try_find_literal("<|end|>")) {
            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
        }
        return builder.consume_rest();
    };

    auto handle_tool_call = [&](const std::string & name) {
        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
            if (builder.syntax().parse_tool_calls) {
                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
            } else if (args->is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
    };

    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
        auto match = regex.search(input, 0, true);
        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
            return match;
        }
        return std::nullopt;
    };

    do {
        auto header_start_pos = builder.pos();
        auto content_start = builder.try_find_literal("<|message|>");
        if (!content_start) {
            throw common_chat_msg_partial_exception("incomplete header");
        }

        auto header = content_start->prelude;

        if (auto match = regex_match(tool_call1_regex, header)) {
            auto group = match->groups[1];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        if (auto match = regex_match(tool_call2_regex, header)) {
            auto group = match->groups[2];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        if (regex_match(analysis_regex, header)) {
            builder.move_to(header_start_pos);
            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
                builder.add_content(consume_end(true));
            } else {
                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
            }
            continue;
        }

        if (regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
            builder.add_content(consume_end());
            continue;
        }

        // Possibly a malformed message, attempt to recover by rolling
        // back to pick up the next <|start|>
        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
        builder.move_to(header_start_pos);
    } while (builder.try_find_regex(start_regex, std::string::npos, false));

    auto remaining = builder.consume_rest();
    if (!remaining.empty()) {
        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
    }
}
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    LOG_DBG("%s\n", __func__);
    common_chat_params data;

@@ -1858,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
        "|<function name=\"([^\"]+)\">" // match 5 (function name again)
    );

    while (auto res = builder.try_find_regex(open_regex)) {
    if (auto res = builder.try_find_regex(open_regex)) {
        const auto & block_start = res->groups[1];
        std::string block_end = block_start.empty() ? "" : "```";

@@ -1880,6 +1668,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
            builder.consume_literal(block_end);
            builder.consume_spaces();
        }
        builder.add_content(builder.consume_rest());
    } else {
        throw common_chat_msg_partial_exception("failed to parse tool call");
    }

@@ -1904,124 +1693,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                builder.consume_spaces();
            }
        }
    }
}

    builder.add_content(builder.consume_rest());
}
static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

    // Pass thinking context for Granite template
    json additional_context = {
        {"thinking", inputs.enable_thinking},
    };

    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
    data.format = COMMON_CHAT_FORMAT_GRANITE;

    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
        if (!inputs.enable_thinking) {
            data.prompt += "</think>";
        } else {
            data.thinking_forced_open = true;
        }
    }

    if (!inputs.tools.is_null()) {
        // Granite uses <|tool_call|> followed by JSON list
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            std::vector<std::string> tool_rules;
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                std::string name = function.at("name");
                auto parameters = function.at("parameters");
                builder.resolve_refs(parameters);
                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
                    "-args", {
                        {"type", "object"},
                        {"properties", {
                            {"name", {{"const", name}}},
                            {"arguments", parameters},
                        }},
                        {"required", json::array({"name", "arguments"})},
                    })));
            });

            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");

            if (data.thinking_forced_open) {
                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
            } else {
                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
            }

            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
                "<|tool_call|>"
            });

            data.preserved_tokens = {
                "<think>",
                "</think>",
                "<response>",
                "</response>",
                "<|tool_call|>",
            };
        });
    } else {
        // Handle thinking tags for non-tool responses
        if (data.thinking_forced_open && inputs.enable_thinking) {
            data.grammar_lazy = false;
            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
            });
            data.preserved_tokens = {
                "<think>",
                "</think>",
                "<response>",
                "</response>",
            };
        }
    }

    return data;
}

static void common_chat_parse_granite(common_chat_msg_parser & builder) {
    // Parse thinking tags
    builder.try_parse_reasoning("<think>", "</think>");

    // Parse response tags using regex
    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
    if (auto res = builder.try_find_regex(response_regex)) {
        // Extract the content between the tags (capture group 1)
        auto content = builder.str(res->groups[1]);
        builder.add_content(content);
        builder.move_to(res->groups[0].end);
    }

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        // Expect JSON array of tool calls
        auto tool_calls_data = builder.consume_json();
        if (tool_calls_data.json.is_array()) {
            if (!builder.add_tool_calls(tool_calls_data.json)) {
                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
            }
        } else {
            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
            builder.add_content(builder.consume_rest());
        }
    } else {
        builder.add_content(builder.consume_rest());
@@ -2061,8 +1733,6 @@ static common_chat_params common_chat_templates_apply_jinja(
    params.enable_thinking = inputs.enable_thinking;
    params.grammar = inputs.grammar;
    params.now = inputs.now;
    params.add_bos = tmpls->add_bos;
    params.add_eos = tmpls->add_eos;

    params.extra_context = json::object();
    for (auto el : inputs.chat_template_kwargs) {

@@ -2099,21 +1769,11 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_command_r7b(tmpl, params);
    }

    // Granite (IBM) - detects thinking / tools support
    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
        return common_chat_params_init_granite(tmpl, params);
    }

    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_hermes_2_pro(tmpl, params);
    }

    // GPT-OSS
    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_gpt_oss(tmpl, params);
    }

    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {

@@ -2164,7 +1824,6 @@ static common_chat_params common_chat_templates_apply_legacy(
    int alloc_size = 0;
    std::vector<llama_chat_message> chat;
    std::vector<std::string> contents;

    for (const auto & msg : inputs.messages) {
        auto content = msg.content;
        for (const auto & part : msg.content_parts) {

@@ -2266,12 +1925,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_COMMAND_R7B:
            common_chat_parse_command_r7b(builder);
            break;
        case COMMON_CHAT_FORMAT_GRANITE:
            common_chat_parse_granite(builder);
            break;
        case COMMON_CHAT_FORMAT_GPT_OSS:
            common_chat_parse_gpt_oss(builder);
            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }

@@ -2291,8 +1944,6 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
        }
    }
    auto msg = builder.result();
    if (!is_partial) {
        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
    }
    LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
    return msg;
}
@@ -109,8 +109,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_GRANITE,
    COMMON_CHAT_FORMAT_GPT_OSS,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};

@@ -129,8 +127,6 @@ struct common_chat_templates_inputs {
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    std::map<std::string, std::string> chat_template_kwargs;
    bool add_bos = false;
    bool add_eos = false;
};

struct common_chat_params {

@@ -187,12 +183,10 @@ std::string common_chat_format_single(
// Returns an example of formatted chat
std::string common_chat_format_example(
    const struct common_chat_templates * tmpls,
    bool use_jinja,
    const std::map<std::string, std::string> & chat_template_kwargs);
    bool use_jinja);

const char * common_chat_format_name(common_chat_format format);
const char * common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -41,7 +41,6 @@
#endif
#include <locale>
#include <windows.h>
#include <string.h>
#include <fcntl.h>
#include <io.h>
#else

@@ -1123,7 +1122,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;

@@ -1566,56 +1564,3 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
    return result;
}

ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
    const lr_opt & d = *(lr_opt *) userdata;
    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
    result.sgd.wd = result.adamw.wd = d.wd;
    return result;
}

// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const* a, char const* b) {
    return !
#if defined(_MSC_VER)
        _stricmp
#else
        strcasecmp
#endif // defined(_MSC_VER)
        (a, b);
}

enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
    if (eq_case_insensitive("adamw", n)) {
        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    }
    if (eq_case_insensitive("sgd", n)) {
        return GGML_OPT_OPTIMIZER_TYPE_SGD;
    }
    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
}

// TODO simplify to use just log and exp
static float const k_log_2 = std::log(2.f);

void lr_opt::init() {
    if (lr_min > 0 && lr_min < lr0) {
        float nhalf = std::log(lr0 / lr_min) / k_log_2;
        float e = epochs;
        if (decay_epochs > 0 && decay_epochs < e) {
            e = decay_epochs;
        } else {
            decay_epochs = e;
        }
        scale_epoch = nhalf / e;
    }
}

float lr_opt::get_lr(float epoch) const {
    float r = lr_min <= 0 ? lr0 :
        epoch >= decay_epochs ? lr_min :
        lr0 * std::pow(0.5f, epoch * scale_epoch);
    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
    return r;
}
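As a concrete illustration of the half-life schedule implemented by `lr_opt` above, here is a small self-contained sketch; the numbers (lr0, lr_min, and the 6-epoch decay window) are chosen for the example and are not the defaults shown in the struct.

```cpp
// Illustration only: lr0 / lr_min = 8, so nhalf = log(8)/log(2) = 3 halvings,
// spread over 6 epochs => scale_epoch = 3 / 6 = 0.5, exactly as lr_opt::init()
// computes above.
#include <cmath>
#include <cstdio>

int main() {
    const float lr0 = 1e-5f, lr_min = 1.25e-6f, decay_epochs = 6.0f;
    const float nhalf       = std::log(lr0 / lr_min) / std::log(2.0f); // 3
    const float scale_epoch = nhalf / decay_epochs;                    // 0.5
    for (float e = 0.0f; e <= 6.0f; e += 2.0f) {
        const float lr = e >= decay_epochs ? lr_min
                                           : lr0 * std::pow(0.5f, e * scale_epoch);
        std::printf("epoch %.0f lr=%g\n", e, lr); // 1e-05, 5e-06, 2.5e-06, 1.25e-06
    }
    return 0;
}
```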
@@ -2,17 +2,14 @@

#pragma once

#include "llama-cpp.h"

#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#include <cmath>

#include "ggml-opt.h"
#include "llama-cpp.h"

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'

@@ -85,7 +82,6 @@ enum llama_example {
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,

    LLAMA_EXAMPLE_COUNT,
};
@@ -205,8 +201,6 @@ struct common_params_speculative {
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float p_split = 0.1f;      // speculative decoding split probability
    float p_min = 0.75f;       // minimum speculative decoding probability (greedy)
    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

@@ -226,46 +220,19 @@ struct common_params_vocoder {
};

struct common_params_diffusion {
    int32_t steps = 128;
    bool visual_mode = false;

    float eps = 0;            // epsilon for timesteps
    int32_t block_length = 0; // block length for generation

    int32_t algorithm = 4;    // default algorithm: low-confidence
    float alg_temp = 0.0f;    // algorithm temperature

    float cfg_scale = 0;      // classifier-free guidance scale
    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
    int32_t steps = 64;       // number of diffusion steps
    float eps = 1e-3f;        // epsilon for timesteps
    int32_t algorithm = 0;    // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
    float alg_temp = 0.0f;    // algorithm temperature
    bool visual_mode = false; // show progressive diffusion on screen
};

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    COMMON_REASONING_FORMAT_GRANITE,         // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};

struct lr_opt {
    float lr0 = 1e-5;        // learning rate at first epoch
    float lr_min = -1;
    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float scale_epoch = 0;
    float wd = 0;
    unsigned epochs = 2;

    unsigned epoch; // set by optimizer outer (epochs) loop
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); }
    // must call after arg parse, before get_lr
    void init();
};

struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
struct common_params {
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_ctx = 4096;   // context size

@@ -385,7 +352,6 @@ struct common_params {
    bool warmup = true;          // warmup run
    bool check_tensors = false;  // validate tensor data
    bool no_op_offload = false;  // globally disable offload host tensor operations to device
    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)

    bool single_turn = false;    // single turn chat conversation

@@ -400,11 +366,6 @@ struct common_params {
    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool embedding = false;     // get only sentence embedding
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

@@ -413,12 +374,11 @@ struct common_params {
    std::string cls_sep = "\t"; // separator of classification sequences

    // server params
    int32_t port = 8080;                  // server listens on this network port
    int32_t timeout_read = 600;           // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1;          // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0;            // min chunk size to reuse from the cache via KV shifting
    int32_t n_swa_checkpoints = 3;        // max number of SWA checkpoints per slot
    int32_t port = 8080;                  // server listens on this network port
    int32_t timeout_read = 600;           // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1;          // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0;            // min chunk size to reuse from the cache via KV shifting

    std::string hostname = "127.0.0.1";
    std::string public_path = ""; // NOLINT

@@ -426,7 +386,7 @@ struct common_params {
    std::string chat_template = ""; // NOLINT
    bool use_jinja = false;         // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

@@ -471,7 +431,6 @@ struct common_params {
    int32_t n_out_freq = 10;  // output the imatrix every n_out_freq iterations
    int32_t n_save_freq = 0;  // save the imatrix every n_save_freq iterations
    int32_t i_chunk = 0;      // start processing from this chunk
    int8_t imat_dat = 0;      // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl = true;     // whether to compute perplexity

@@ -733,6 +692,3 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
//

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);

// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
@@ -1,39 +1,30 @@
#include "speculative.h"

#include "ggml.h"
#include "llama.h"
#include "log.h"
#include "common.h"
#include "sampling.h"

#include <cstring>
#include <algorithm>
#include <map>

#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

struct common_speculative {
    struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
    struct llama_context * ctx_dft;
    struct llama_context * ctx;
    struct common_sampler * smpl;

    llama_batch batch;
    llama_tokens prompt_dft;
    bool vocab_dft_compatible = true; // whether retokenization is needed
    std::map<std::string, std::string> tgt_dft_replacements = {};
    llama_tokens prompt;
};

struct common_speculative * common_speculative_init(
        struct llama_context * ctx_tgt,
        struct llama_context * ctx_dft) {
    auto * result = new common_speculative {
        /* .ctx_tgt              = */ ctx_tgt,
        /* .ctx_dft              = */ ctx_dft,
        /* .smpl                 = */ nullptr,
        /* .batch                = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
        /* .prompt_dft           = */ {},
        /* .vocab_dft_compatible = */ false,
        /* .ctx    = */ ctx_dft,
        /* .smpl   = */ nullptr,
        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
        /* .prompt = */ {},
    };

    // TODO: optimize or pass from outside?

@@ -68,9 +59,6 @@ struct common_speculative * common_speculative_init(
    }
#endif

    result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
    LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);

    return result;
}
@@ -87,8 +75,8 @@ void common_speculative_free(struct common_speculative * spec) {
}

bool common_speculative_are_compatible(
    const struct llama_context * ctx_tgt,
    const struct llama_context * ctx_dft) {
    const struct llama_context * ctx_tgt,
    const struct llama_context * ctx_dft) {
    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
    const struct llama_model * model_dft = llama_get_model(ctx_dft);

@@ -102,32 +90,31 @@ bool common_speculative_are_compatible(
    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
        LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
        LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
        return false;
    }

    if (
        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
    ) {
        LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
        return false;
    }

    {
        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
        const int vocab_diff = n_vocab_tgt > n_vocab_dft
            ? n_vocab_tgt - n_vocab_dft
            : n_vocab_dft - n_vocab_tgt;

        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);

        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
            LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
            LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
                    "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                    __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            return false;
        }

@@ -135,8 +122,8 @@ bool common_speculative_are_compatible(
        const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
        const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
        if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
            LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
            LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
            LOG_ERR("%s: draft vocab must match target vocab to use speculation but "
                    "token %d content differs - target '%s', draft '%s'\n", __func__, i,
                common_token_to_piece(ctx_tgt, i).c_str(),
                common_token_to_piece(ctx_dft, i).c_str());
            return false;

@@ -147,93 +134,32 @@ bool common_speculative_are_compatible(
    return true;
}
void common_speculative_add_replacement_tgt_dft(
        struct common_speculative * spec,
        const char * source, const char * dest) {
    spec->tgt_dft_replacements[source] = dest;
}

static std::string replace_to_dft(
        struct common_speculative * spec,
        const std::string & input) {
    std::string result = input;
    for (const auto & pair : spec->tgt_dft_replacements) {
        size_t pos = result.find(pair.first);
        while (pos != std::string::npos) {
            result.replace(pos, pair.first.length(), pair.second);
            pos = result.find(pair.first, pos + pair.second.length());
        }
    }
    return result;
}

static std::string replace_to_tgt(
        struct common_speculative * spec,
        const std::string & input) {
    std::string result = input;
    for (const auto & pair : spec->tgt_dft_replacements) {
        size_t pos = result.find(pair.second);
        while (pos != std::string::npos) {
            result.replace(pos, pair.second.length(), pair.first);
            pos = result.find(pair.second, pos + pair.first.length());
        }
    }
    return result;
}
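For context, a minimal sketch of how the replacement table above is meant to be filled (the token strings are purely illustrative, not taken from a real model pair): once registered, `replace_to_dft()` rewrites each target-vocab spelling into the draft-vocab one, and `replace_to_tgt()` reverses the mapping on the way back.

```cpp
// Hypothetical example: spec is an initialized common_speculative whose target
// and draft models spell their special tokens differently.
common_speculative_add_replacement_tgt_dft(spec, "<|im_start|>", "<s>");
common_speculative_add_replacement_tgt_dft(spec, "<|im_end|>", "</s>");
// replace_to_dft(spec, "<|im_start|>user") -> "<s>user"
// replace_to_tgt(spec, "<s>user")          -> "<|im_start|>user"
```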
llama_tokens common_speculative_gen_draft(
        struct common_speculative * spec,
        struct common_speculative_params params,
        const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
        const llama_tokens & prompt_tgt,
        llama_token id_last) {
    auto & batch = spec->batch;
    auto & ctx_tgt = spec->ctx_tgt;
    auto & ctx_dft = spec->ctx_dft;
    auto & ctx = spec->ctx;
    auto & smpl = spec->smpl;
    auto & prompt_dft = spec->prompt_dft;
    auto & prompt = spec->prompt;

    auto * mem_dft = llama_get_memory(ctx_dft);
    auto * mem = llama_get_memory(ctx);

    int reuse_i = 0;
    int reuse_n = 0;

    const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft;

    llama_tokens prompt_tgt_draft_model;
    if (!spec->vocab_dft_compatible) {
        std::string text;
        text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true);
        text = replace_to_dft(spec, text);
        LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
        prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true);

        // convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
        const auto * model_tgt = llama_get_model(ctx_tgt);
        const auto * vocab_tgt = llama_model_get_vocab(model_tgt);

        int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
        GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
        text.resize(-n_chars);
        llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
        text = replace_to_dft(spec, text);

        LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
        id_last = common_tokenize(ctx_dft, text, false, true)[0];
    }
    // prompt_tgt's tokens will always be compatible with ctx_dft
    const llama_tokens & prompt_tgt =
        spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;
    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;

    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);

    // reuse as much as possible from the old draft context
    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
    for (int i = 0; i < (int) prompt_dft.size(); ++i) {
    for (int i = 0; i < (int) prompt.size(); ++i) {
        int cur = 0;
        while (i_start + cur < (int) prompt_tgt.size() &&
               i + cur < (int) prompt_dft.size() &&
               prompt_tgt[i_start + cur] == prompt_dft[i + cur]) {
               i + cur < (int) prompt.size() &&
               prompt_tgt[i_start + cur] == prompt[i + cur]) {
            cur++;
        }

@@ -243,20 +169,21 @@ llama_tokens common_speculative_gen_draft(
        }
    }

    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());

    llama_tokens result;
    result.reserve(params.n_draft);

    if (reuse_n == 0) {
        llama_memory_clear(mem_dft, false);
        prompt_dft.clear();
        llama_memory_clear(mem, false);

        prompt.clear();
    } else {
        // this happens when a previous draft has been discarded (for example, due to being too small), but the
        // target model agreed with it. in this case, we simply pass back the previous results to save compute
        if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
            for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
                result.push_back(prompt_dft[i]);
        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
                result.push_back(prompt[i]);

                if (params.n_draft <= (int) result.size()) {
                    break;

@@ -267,15 +194,16 @@ llama_tokens common_speculative_gen_draft(
        }

        if (reuse_i > 0) {
            llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
            llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
            llama_memory_seq_rm (mem, 0, 0, reuse_i);
            llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);

            prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
        }

        if (reuse_n < (int) prompt_dft.size()) {
            llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
            prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
        if (reuse_n < (int) prompt.size()) {
            llama_memory_seq_rm (mem, 0, reuse_n, -1);

            prompt.erase(prompt.begin() + reuse_n, prompt.end());
        }
    }

@@ -286,28 +214,28 @@ llama_tokens common_speculative_gen_draft(
        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);

        prompt_dft.push_back(prompt_tgt[i]);
        prompt.push_back(prompt_tgt[i]);
    }

    // we should rarely end-up here during normal decoding
    if (batch.n_tokens > 0) {
        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());

        llama_decode(ctx_dft, batch);
        llama_decode(ctx, batch);
    }

    const llama_pos n_past = prompt_dft.size();
    const llama_pos n_past = prompt.size();

    LOG_DBG("%s: n_past = %d\n", __func__, n_past);

    common_batch_clear(batch);
    common_batch_add (batch, id_last, n_past, { 0 }, true);

    prompt_dft.push_back(id_last);
    prompt.push_back(id_last);

    LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());

    llama_decode(ctx_dft, batch);
    llama_decode(ctx, batch);

    common_sampler_reset(smpl);

@@ -315,13 +243,13 @@ llama_tokens common_speculative_gen_draft(
    for (int i = 0; i < params.n_draft; ++i) {
        common_batch_clear(batch);

        common_sampler_sample(smpl, ctx_dft, 0, true);
        common_sampler_sample(smpl, ctx, 0, true);

        const auto * cur_p = common_sampler_get_candidates(smpl);

        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
        }

        // add drafted token for each sequence

@@ -343,19 +271,10 @@ llama_tokens common_speculative_gen_draft(
        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

        // evaluate the drafted tokens on the draft model
        llama_decode(ctx_dft, batch);
        llama_decode(ctx, batch);

        prompt_dft.push_back(id);
        prompt.push_back(id);
    }

    if (!spec->vocab_dft_compatible) {
        std::string detokenized = common_detokenize(ctx_dft, result, true);
        detokenized = replace_to_tgt(spec, detokenized);
        LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
        result = common_tokenize(ctx_tgt, detokenized, false, true);
        if (result.size() > (size_t) params.n_draft) {
            result.resize(params.n_draft);
        }
    }
    return result;
}
@@ -12,10 +12,7 @@ struct common_speculative_params {
    float p_min = 0.75f; // min probability required to accept a token in the draft
};

struct common_speculative * common_speculative_init(
        struct llama_context * ctx_tgt,
        struct llama_context * ctx_dft
);
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

void common_speculative_free(struct common_speculative * spec);

@@ -23,10 +20,6 @@ bool common_speculative_are_compatible(
    const struct llama_context * ctx_tgt,
    const struct llama_context * ctx_dft);

void common_speculative_add_replacement_tgt_dft(
    struct common_speculative * spec,
    const char * source, const char * dest);

// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
    struct common_speculative * spec,
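To tie the declarations above together, here is a minimal, hypothetical calling sequence using the two-context variant of the API (contexts and the surrounding target-model loop are assumed to exist elsewhere; `n_draft` is assumed to be a field of `common_speculative_params`, as its use in `speculative.cpp` above implies).

```cpp
// Sketch of one speculative-decoding step; not the definitive integration.
common_speculative_params params;
params.n_draft = 16;    // max tokens to draft per step (assumed field)
params.p_min   = 0.75f; // min draft probability, as documented above

common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);

// prompt_tgt: tokens so far in the target vocab; id_last: last sampled token
llama_tokens draft = common_speculative_gen_draft(spec, params, prompt_tgt, id_last);
// ... evaluate `draft` with the target model and accept the longest agreeing prefix ...

common_speculative_free(spec);
```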
(File diff suppressed because it is too large)
@@ -59,10 +59,6 @@ parser.add_argument(
    "--full", action="store_true",
    help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
    "--check-missing", action="store_true",
    help="only check for missing pre-tokenizer hashes",
)
parser.add_argument(
    "hf_token",
    help="optional HF token",

@@ -74,10 +70,6 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")

if args.check_missing and args.full:
    logger.warning("Downloading full list of models requested, ignoring --check-missing!")
    args.check_missing = False

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

@@ -138,7 +130,6 @@ models = [
    {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
    {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
    {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
]

# some models are known to be broken upstream, so we will skip them as exceptions

@@ -147,17 +138,14 @@ pre_computed_hashes = [
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
    {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
]

@@ -232,13 +220,12 @@ if not args.full:
    all_models = models.copy()
    models = [model for model in all_models if model["name"] not in existing_models]

if not args.check_missing:
    logging.info(f"Downloading {len(models)} models...")
    for model in models:
        try:
            download_model(model)
        except Exception as e:
            logger.error(f"Failed to download model {model['name']}. Error: {e}")
logging.info(f"Downloading {len(models)} models...")
for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")

# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

@@ -340,7 +340,7 @@ if __name__ == '__main__':
    sys.exit(1)
else:
    logger.info(f"Loading base model: {dir_base_model.name}")
    hparams = ModelBase.load_hparams(dir_base_model, False)
    hparams = ModelBase.load_hparams(dir_base_model)

with torch.inference_mode():
    try:
@@ -310,7 +310,5 @@ Specifies the memory pool management strategy:

Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.

### GGML_CANN_WEIGHT_NZ

Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
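Assuming this option is toggled through an environment variable like the other GGML_CANN_* settings in this section (an assumption; the diff does not show the mechanism), enabling it might look like:

```bash
# assumed usage: enable ND -> NZ weight conversion before running on a 310I DUO NPU
export GGML_CANN_WEIGHT_NZ=1
./build/bin/llama-cli -m model.gguf -ngl 99
```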
## TODO
- Support more models and data types.
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- By default, NNPA is disabled by default. To enable it:
|
||||
- By default, NNPA is enabled when available. To disable it (not recommended):
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DGGML_NNPA=ON
|
||||
-DGGML_NNPA=OFF
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
@@ -76,23 +76,6 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
## IBM zDNN Accelerator
|
||||
|
||||
This provides acceleration using the IBM zAIU co-processor located in the Telum I and Telum II processors. Make sure to have the [IBM zDNN library](https://github.com/IBM/zDNN) installed.
|
||||
|
||||
#### Compile from source from IBM
|
||||
|
||||
You may find the official build instructions here: [Building and Installing zDNN](https://github.com/IBM/zDNN?tab=readme-ov-file#building-and-installing-zdnn)
|
||||
|
||||
### Compilation
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_ZDNN=ON
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
```
|
||||
|
||||
## Getting GGUF Models
|
||||
|
||||
All models need to be converted to Big-Endian. You can achieve this in three cases:
|
||||
@@ -101,9 +84,9 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
You can find popular models pre-converted and verified at [s390x Verified Models](https://huggingface.co/collections/taronaeo/s390x-verified-models-672765393af438d0ccb72a08) or [s390x Runnable Models](https://huggingface.co/collections/taronaeo/s390x-runnable-models-686e951824198df12416017e).
|
||||
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
|
||||
|
||||
These models have already been converted from `safetensors` to `GGUF` Big-Endian and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
|
||||
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
|
||||
|
||||
@@ -111,14 +94,6 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
|
||||
|
||||
Ensure that you have installed the required packages in advance
|
||||
|
||||
```bash
|
||||
pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
Convert the `safetensors` model to `GGUF`
|
||||
|
||||
```bash
|
||||
python3 convert_hf_to_gguf.py \
|
||||
--outfile model-name-be.f16.gguf \
|
||||
@@ -141,7 +116,7 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B GGUF](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
|
||||
```bash
|
||||
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
|
||||
@@ -162,19 +137,19 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

### 1. SIMD Acceleration

Only available on IBM z15/LinuxONE 3 or later systems with the `-DGGML_VXE=ON` compile flag (turned on by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12; on such systems the APIs can still run, but will use a scalar implementation.
Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` compile flag (turned on by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12; on such systems the APIs can still run, but will use a scalar implementation.

### 2. NNPA Vector Intrinsics Acceleration

Only available on IBM z16/LinuxONE 4 or later systems with the `-DGGML_NNPA=ON` compile flag (turned off by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13; on such systems the APIs can still run, but will use a scalar implementation.
Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` compile flag (turned on when available). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13; on such systems the APIs can still run, but will use a scalar implementation.
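A hedged configure sketch combining the flags above with the build recipe shown earlier (flag defaults are as described in the surrounding text):

```bash
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_VXE=ON \
    -DGGML_NNPA=OFF
cmake --build build --config Release -j$(nproc)
```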
### 3. zDNN Accelerator (WIP)
### 3. zDNN Accelerator

Only available on IBM z17/LinuxONE 5 or later systems with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13; on such systems the APIs will fall back to CPU routines.
_Only available on IBM z16 or later systems. No direction at the moment._

### 4. Spyre Accelerator

_Only available with IBM z17 / LinuxONE 5 or later systems. No support currently available._
_No direction at the moment._
## Performance Tuning

@@ -214,26 +189,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

Answer: Please ensure that your GCC compiler is at least version 15.1.0 and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.

4. Failing to install the `sentencepiece` package using GCC 15+

Answer: The `sentencepiece` team is aware of this, as seen in [this issue](https://github.com/google/sentencepiece/issues/1108).

As a temporary workaround, please run the installation command with the following environment variable.

```bash
export CXXFLAGS="-include cstdint"
```

For example:

```bash
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```
5. `-DGGML_NNPA=ON` generates gibberish output

Answer: We are aware of this, as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF` (both mitigations are sketched below).
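A minimal sketch of the two mitigations; the model path and thread count are illustrative:

```bash
# 1) retry with fewer threads
./build/bin/llama-cli -m model-name-be.f16.gguf -t 2 -p "Hello"

# 2) or rebuild with NNPA disabled
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NNPA=OFF
cmake --build build --config Release -j$(nproc)
```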
## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**

@@ -246,12 +201,11 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

## Appendix A: Hardware Support Matrix

|          | Support | Minimum Compiler Version |
| -------- | ------- | ------------------------ |
| IBM z15  | ✅      |                          |
| IBM z16  | ✅      |                          |
| IBM z17  | ✅      | GCC 15.1.0               |
| IBM zDNN | ✅      |                          |

|         | Support | Minimum Compiler Version |
| ------- | ------- | ------------------------ |
| IBM z15 | ✅      |                          |
| IBM z16 | ✅      |                          |
| IBM z17 | ✅      | GCC 15.1.0               |

- ✅ - supported and verified to run as intended
- 🚫 - unsupported, we are unlikely able to provide support
@@ -260,7 +214,7 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

|      | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---- | ----------- | ---- | ---- | ----- |
| FP32 | ✅          | ✅   | ✅   | ❓    |
| FP32 | ✅          | ✅   | ❓   | ❓    |
| FP16 | ✅          | ✅   | ❓   | ❓    |
| BF16 | 🚫          | 🚫   | ❓   | ❓    |
| Q4_0 | ✅          | ✅   | ❓   | ❓    |

@@ -290,5 +244,3 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025.
@@ -23,19 +23,11 @@ The convert script reads the model configuration, tokenizer, tensor names+data a

The required steps to implement for an HF model are:

1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass, example:
1. Define the model `Model.register` annotation in a new `Model` subclass, example:

```python
@ModelBase.register("MyModelForCausalLM")
class MyModel(TextModel):
    model_arch = gguf.MODEL_ARCH.MYMODEL
```

or

```python
@ModelBase.register("MyModelForConditionalGeneration")
class MyModel(MmprojModel):
@Model.register("MyModelForCausalLM")
class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.MYMODEL
```
@@ -83,10 +75,9 @@ block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {

`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.

Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
- `TextModel#set_gguf_parameters`
- `MmprojModel#set_gguf_parameters`
- `ModelBase#set_vocab`
- `ModelBase#modify_tensors`
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`

NOTE: Tensor names must end with a `.weight` or `.bias` suffix; that is the convention, and several tools like `quantize` expect it in order to process the weights.
@@ -97,9 +97,6 @@ NOTE: some models may require large context window, for example: `-c 8192`

# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF for this model, as it gives very poor results
# ref: https://github.com/ggml-org/llama.cpp/pull/13760

# Mistral's Voxtral
(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
```

**Mixed modalities**:
@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta

Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

@@ -29,8 +29,8 @@ cmake --build build --config Release

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --minicpmv_version 4
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

# quantize int4 version
```
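The hunk ends before the quantize command. By analogy with the complete MiniCPM-o 4 instructions below, the step presumably looks like this (a hedged reconstruction, not verbatim from this diff):

```bash
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```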
@@ -1,47 +0,0 @@

## MiniCPM-o 4

### Prepare models and code

Download the [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from huggingface to the "MiniCPM-o-4" folder.

### Build llama.cpp
Readme modification time: 20250206

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```

### Usage of MiniCPM-o 4

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model

# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Inference on Linux or Mac:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
```

@@ -28,8 +28,8 @@ cmake --build build --config Release

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --minicpmv_version 2
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

# quantize int4 version
```

@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta

Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

@@ -28,8 +28,8 @@ cmake --build build --config Release

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --minicpmv_version 3
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

# quantize int4 version
```

@@ -1,47 +0,0 @@

## MiniCPM-V 4

### Prepare models and code

Download the [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from huggingface to the "MiniCPM-V-4" folder.

### Build llama.cpp
Readme modification time: 20250206

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```

### Usage of MiniCPM-V 4

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model

# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Inference on Linux or Mac:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
```
@@ -2,102 +2,94 @@

List of GGML operations and backend support status.

## How to add a backend to this table:

1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py` (both steps are sketched below)
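A minimal sketch of the two steps for a hypothetical `CUDA` backend, assuming you run from the repository root with a built `test-backend-ops` binary (binary location is an assumption):

```bash
./build/bin/test-backend-ops support --output csv > docs/ops/CUDA.csv
./scripts/create_ops_docs.py
```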
Legend:
- ✅ Fully supported by this backend
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| Operation | BLAS | CPU | CUDA | Metal |
|-----------|------|------|------|------|
| ABS | ❌ | ✅ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ |
| ADD | ❌ | ✅ | ✅ | 🟡 |
| ADD1 | ❌ | ✅ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
| CONT | ❌ | ✅ | 🟡 | ✅ |
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
| COS | ❌ | ✅ | ✅ | 🟡 |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
| DIV | ❌ | ✅ | ✅ | 🟡 |
| DUP | ❌ | ✅ | 🟡 | 🟡 |
| ELU | ❌ | ✅ | ❌ | 🟡 |
| EXP | ❌ | ✅ | 🟡 | ❌ |
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
| GELU | ❌ | ✅ | 🟡 | 🟡 |
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
| LOG | ❌ | ✅ | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ |
| MUL | ❌ | ✅ | ✅ | 🟡 |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
| NEG | ❌ | ✅ | 🟡 | 🟡 |
| NORM | ❌ | ✅ | ✅ | 🟡 |
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ |
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
| REGLU | ❌ | ✅ | ✅ | 🟡 |
| RELU | ❌ | ✅ | 🟡 | 🟡 |
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
| ROPE | ❌ | ✅ | ✅ | ✅ |
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
| SCALE | ❌ | ✅ | ✅ | ✅ |
| SET | ❌ | ✅ | ❌ | ✅ |
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
| SGN | ❌ | ✅ | 🟡 | ❌ |
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
| SILU | ❌ | ✅ | 🟡 | 🟡 |
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | 🟡 |
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | 🟡 |
| SQRT | ❌ | ✅ | ✅ | 🟡 |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
| STEP | ❌ | ✅ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | 🟡 |
| SUM | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
| TANH | ❌ | ✅ | 🟡 | 🟡 |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |
(Diffs for docs/ops/BLAS.csv, CANN.csv, CPU.csv, CUDA.csv, Metal.csv, OpenCL.csv, SYCL.csv, Vulkan.csv, and zDNN.csv suppressed because they are too large.)
@@ -1,13 +0,0 @@

# Diffusion Text Generation

This directory contains implementations for Diffusion LLMs (DLLMs).

More Info:
- https://github.com/ggml-org/llama.cpp/pull/14644
- https://github.com/ggml-org/llama.cpp/pull/14771

Example of using the Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`

Example of using the LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
@@ -5,128 +5,344 @@
#include "log.h"

#include <limits.h>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <random>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <limits>
#include <random>

enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };

// Unified transfer scheduling methods
enum transfer_schedule {
    TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
    BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
};
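// note (editorial): with t = 1 - (step/total_steps)*(1 - eps) and s the same expression
// at step + 1, TIMESTEP_BASED transfers a fraction (1 - s/t) of the currently masked
// positions per step; BLOCK_BASED instead spreads each block's masked-token budget
// evenly over that block's steps (see get_num_transfer_tokens below).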
typedef bool (*diffusion_step_callback_t)(int32_t step,
                                          int32_t total_steps,
typedef bool (*diffusion_step_callback_t)(int32_t step,
                                          int32_t total_steps,
                                          const llama_token * tokens,
                                          int32_t n_tokens,
                                          void * user_data);
                                          int32_t n_tokens,
                                          void * user_data);

enum diffusion_alg {
    DIFFUSION_ALG_ORIGIN       = 0,
    DIFFUSION_ALG_MASKGIT_PLUS = 1,
    DIFFUSION_ALG_TOPK_MARGIN  = 2,
    DIFFUSION_ALG_ENTROPY      = 3,
};

struct diffusion_params {
    int32_t                   steps                   = 0;
    float                     temperature             = 0;
    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;
    diffusion_step_callback_t step_callback           = nullptr;
    void *                    step_callback_user_data = nullptr;
    int32_t                   seed                    = 0;
    bool                      visual_mode             = false;
    bool                      shift_logits            = false;  // Shift logits by -1 after decode

    float   top_p = 0.;
    int32_t top_k = 0.;

    diffusion_algorithm algorithm = CONFIDENCE_BASED;
    transfer_schedule   schedule  = TIMESTEP_BASED;

    float   cfg_scale        = 0.;     // Config scale for classifier-free guidance
    float   eps              = 0.;     // Timestep scheduling
    int32_t block_length     = 0;      // Block size (for block scheduling)
    float   alg_temp         = 0;      // algorithm temperature (0.0 = deterministic)
    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0

    int32_t max_length = 0;  // Maximum sequence length
    int32_t                   steps;
    float                     eps;
    float                     temperature;
    float                     top_p;
    int32_t                   top_k;
    llama_token               mask_token_id;
    enum diffusion_alg        algorithm;
    float                     alg_temp;
    diffusion_step_callback_t step_callback;
    void *                    step_callback_user_data;
    int32_t                   seed;
};

static diffusion_params diffusion_default_params() {
    diffusion_params params        = {};
    params.steps                   = 64;
    params.eps                     = 1e-3f;
    params.temperature             = 0.2f;
    params.top_p                   = 0.95f;
    params.top_k                   = 0;
    params.mask_token_id           = LLAMA_TOKEN_NULL;
    params.algorithm               = DIFFUSION_ALG_ORIGIN;
    params.alg_temp                = 0.0f;
    params.step_callback           = nullptr;
    params.step_callback_user_data = nullptr;
    params.seed                    = 0;
    return params;
}

static void diffusion_generate(llama_context * ctx,
                               const llama_token * input_tokens,
                               llama_token * output_tokens,
                               int32_t n_input,
                               int32_t max_length,
                               struct diffusion_params params,
                               int32_t & n_generated) {

    n_generated = 0;
    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) {
        return;
    }

    const llama_model * model = llama_get_model(ctx);

    // Initialize with input and pad with mask tokens
    std::copy(input_tokens, input_tokens + n_input, output_tokens);
    std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id);

    std::mt19937 rng(params.seed);

    std::vector<float> timesteps(params.steps + 1);
    for (int32_t i = 0; i <= params.steps; i++) {
        timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps);
    }

    llama_set_causal_attn(ctx, false);

    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));

    std::vector<llama_token_data> candidates(n_vocab);

    std::vector<llama_token_data> conf_candidates;
    conf_candidates.reserve(max_length);

    std::vector<int32_t> mask_positions;
    mask_positions.reserve(max_length);

    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
    if (params.top_k > 0) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
    }
    if (params.top_p < 1.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
    }
    if (params.temperature > 0.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
    }
    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));

    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);

    llama_batch batch = llama_batch_init(max_length, 0, 1);
    batch.n_tokens    = max_length;

    int64_t total_sampling_time = 0;
    int64_t total_time          = 0;

    int64_t time_start = ggml_time_us();
    for (int32_t step = 0; step < params.steps; step++) {
        if (params.step_callback) {
            if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) {
                break;
            }
        }

        for (int32_t i = 0; i < max_length; i++) {
            batch.token[i]     = output_tokens[i];
            batch.pos[i]       = i;
            batch.n_seq_id[i]  = 1;
            batch.seq_id[i][0] = 0;
            batch.logits[i]    = 1;
        }

        int ret = llama_decode(ctx, batch);
        if (ret != 0) {
            LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret);
            break;
        }

        float * raw_logits = llama_get_logits(ctx);
        if (!raw_logits) {
            LOG_ERR("%s: failed to get logits at step %d\n", __func__, step);
            break;
        }

        auto get_logits_for_pos = [&](int32_t pos) -> const float * {
            return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab;
        };

        int64_t time_start_sampling = ggml_time_us();

        mask_positions.clear();
        for (int32_t i = 0; i < max_length; i++) {
            if (output_tokens[i] == params.mask_token_id) {
                mask_positions.push_back(i);
            }
        }

        if (mask_positions.empty()) {
            break;
        }

        float t = timesteps[step];
        float s = timesteps[step + 1];

        if (params.algorithm == DIFFUSION_ALG_ORIGIN) {
            float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f;

            for (int32_t pos : mask_positions) {
                if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
                    const float * pos_logits = get_logits_for_pos(pos);
                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                        candidates[token_id].id    = token_id;
                        candidates[token_id].logit = pos_logits[token_id];
                        candidates[token_id].p     = 0.0f;
                    }

                    llama_token_data_array cur_p = {
                        /* .data     = */ candidates.data(),
                        /* .size     = */ (size_t) n_vocab,  // Reset size to full vocab
                        /* .selected = */ -1,
                        /* .sorted   = */ false,
                    };

                    llama_sampler_apply(sampler, &cur_p);
                    output_tokens[pos] = cur_p.data[cur_p.selected].id;
                }
            }
        } else {
            std::vector<std::pair<float, int32_t>> confidences;
            std::vector<llama_token>               sampled_tokens(mask_positions.size());

            for (size_t i = 0; i < mask_positions.size(); i++) {
                int32_t       pos        = mask_positions[i];
                const float * pos_logits = get_logits_for_pos(pos);

                for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                    candidates[token_id].logit = pos_logits[token_id];
                    candidates[token_id].p     = 0.0f;
                    candidates[token_id].id    = token_id;
                }

                llama_token_data_array cur_p = {
                    /* .data     = */ candidates.data(),
                    /* .size     = */ candidates.size(),
                    /* .selected = */ -1,
                    /* .sorted   = */ false,
                };

                llama_sampler_apply(sampler, &cur_p);

                llama_token sampled_token = cur_p.data[cur_p.selected].id;

                float confidence = 0.0f;
                if (params.algorithm == DIFFUSION_ALG_ENTROPY) {
                    const float epsilon = 1e-10f;
                    for (size_t j = 0; j < cur_p.size; j++) {
                        float prob = cur_p.data[j].p;
                        confidence += prob * logf(prob + epsilon);
                    }
                } else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) {
                    confidence = cur_p.data[0].p - cur_p.data[1].p;
                } else {
                    confidence = cur_p.data[cur_p.selected].p;
                }

                sampled_tokens[i] = sampled_token;
                confidences.emplace_back(confidence, i);
            }

            int32_t num_transfer =
                (step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size();

            if (num_transfer > 0) {
                if (params.alg_temp == 0.0f) {
                    std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(),
                                      [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
                                          if (a.first != b.first) {
                                              return a.first > b.first;
                                          }
                                          return a.second < b.second;
                                      });
                } else {
                    conf_candidates.clear();

                    for (int32_t pos = 0; pos < max_length; pos++) {
                        float conf_logit = -std::numeric_limits<float>::infinity();

                        auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
                        if (it != mask_positions.end()) {
                            size_t mask_idx = std::distance(mask_positions.begin(), it);
                            conf_logit = confidences[mask_idx].first / params.alg_temp;  // Apply temperature scaling
                        }

                        conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f });
                    }

                    llama_token_data_array conf_array = {
                        /* .data     = */ conf_candidates.data(),
                        /* .size     = */ conf_candidates.size(),
                        /* .selected = */ -1,
                        /* .sorted   = */ false,
                    };

                    for (int32_t i = 0; i < num_transfer; i++) {
                        // Apply distribution sampler to get selected index
                        llama_sampler_apply(dist_sampler, &conf_array);
                        int selected_idx      = conf_array.selected;
                        confidences[i].second = conf_candidates[selected_idx].id;

                        conf_candidates[selected_idx].p = 0.0f;
                        conf_array.selected             = -1;
                    }
                }

                if (params.alg_temp == 0.0f) {
                    // Deterministic - use confidence order
                    for (int32_t i = 0; i < num_transfer; i++) {
                        int32_t     mask_idx = confidences[i].second;
                        int32_t     pos      = mask_positions[mask_idx];
                        llama_token token    = sampled_tokens[mask_idx];
                        output_tokens[pos]   = token;
                    }
                } else {
                    for (int32_t i = 0; i < num_transfer; i++) {
                        int32_t pos = confidences[i].second;
                        auto    it  = std::find(mask_positions.begin(), mask_positions.end(), pos);
                        if (it != mask_positions.end()) {
                            int32_t mask_idx   = std::distance(mask_positions.begin(), it);
                            output_tokens[pos] = sampled_tokens[mask_idx];
                        }
                    }
                }
            }
        }
        int64_t time_end_sampling = ggml_time_us();
        total_sampling_time += time_end_sampling - time_start_sampling;
    }
    int64_t time_end = ggml_time_us();
    total_time += time_end - time_start;

    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
            total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps);

    llama_batch_free(batch);
    llama_sampler_free(sampler);
    llama_sampler_free(dist_sampler);

    n_generated = max_length;
}
struct callback_data {
    diffusion_params *  diff_params;
    const llama_vocab * vocab;
    int32_t             n_input;
    const common_params_diffusion * diff_params;
    const llama_vocab *             vocab;
    int32_t                         n_input;
};

static float calculate_confidence(const llama_token_data_array & cur_p,
                                  diffusion_algorithm algorithm,
                                  std::mt19937 & rng) {
    switch (algorithm) {
        case CONFIDENCE_BASED:
            return cur_p.data[cur_p.selected].p;  // Selected token probability

        case ENTROPY_BASED:
            {
                float       entropy = 0.0f;
                const float epsilon = 1e-10f;
                for (size_t i = 0; i < cur_p.size; i++) {
                    float prob = cur_p.data[i].p;
                    entropy += prob * logf(prob + epsilon);
                }
                return -entropy;  // Higher entropy = lower confidence
            }

        case MARGIN_BASED:
            return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;

        case RANDOM:
            {
                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
                return uniform(rng);  // Random confidence
            }

        case ORIGIN:
            return cur_p.data[cur_p.selected].p;

        default:
            return 0.0f;
    }
}

// Unified transfer count calculation function
static int32_t calculate_transfer_count(int32_t step,
                                        int32_t total_steps,
                                        int32_t remaining_masked,
                                        transfer_schedule schedule,
                                        float eps,
                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
    switch (schedule) {
        case TIMESTEP_BASED:
            {
                float t          = 1.0f - (float) step / total_steps * (1.0f - eps);
                float s          = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
                float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
                return (int32_t) (remaining_masked * p_transfer);
            }

        case BLOCK_BASED:
            if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
                return num_transfer_tokens[step];
            }
            return remaining_masked / (total_steps - step);  // Fallback

        default:
            return remaining_masked / (total_steps - step);
    }
}

static bool diffusion_step_callback(int32_t step,
                                    int32_t total_steps,
static bool diffusion_step_callback(int32_t step,
                                    int32_t total_steps,
                                    const llama_token * tokens,
                                    int32_t n_tokens,
                                    void * user_data) {
    (void) user_data;
                                    int32_t n_tokens,
                                    void * user_data) {
    (void)user_data;

    callback_data * data = static_cast<callback_data *>(user_data);

@@ -134,11 +350,11 @@ static bool diffusion_step_callback(int32_t step,
        int progress_percent = (step * 100) / total_steps;
        int progress_bars    = (step * 50) / total_steps;
        LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%",
                step,
                total_steps,
                std::string(progress_bars, '=').c_str(),
                std::string(50 - progress_bars, ' ').c_str(),
                progress_percent);
                step,
                total_steps,
                std::string(progress_bars, '=').c_str(),
                std::string(50 - progress_bars, ' ').c_str(),
                progress_percent);
    };

    if (data->diff_params->visual_mode) {
@@ -175,360 +391,6 @@ static bool diffusion_step_callback(int32_t step,
    return true;
}

static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
    if (temperature == 0.0f) {
        return;
    }

    std::uniform_real_distribution<double> uniform(0.0, 1.0);
    for (int32_t i = 0; i < n_vocab; i++) {
        double noise = uniform(rng);
        // Prevent log(0)
        noise               = std::max(noise, 1e-20);
        double gumbel_noise = std::pow(-std::log(noise), temperature);
        logits[i]           = std::exp(logits[i]) / gumbel_noise;
    }
}
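// note (editorial): add_gumbel_noise above rescales each logit to exp(logit) / (-log u)^T
// with u ~ U(0,1) and T = temperature, i.e. Gumbel-style noise whose strength grows with T;
// T == 0 is handled by the early return.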
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
    std::vector<int32_t> num_transfer_tokens(steps);

    int32_t base      = mask_count / steps;
    int32_t remainder = mask_count % steps;

    for (int32_t i = 0; i < steps; i++) {
        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
    }

    return num_transfer_tokens;
}

static void diffusion_generate(llama_context * ctx,
                               const llama_token * input_tokens,
                               llama_token * output_tokens,
                               int32_t n_input,
                               const diffusion_params & params,
                               int32_t & n_generated) {
    n_generated = 0;
    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
        return;
    }

    const llama_model * model = llama_get_model(ctx);

    // Initialize with input and pad with mask tokens
    std::copy(input_tokens, input_tokens + n_input, output_tokens);
    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);

    std::mt19937 rng(params.seed);

    llama_set_causal_attn(ctx, false);

    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));

    std::vector<llama_token_data> candidates(n_vocab);
    std::vector<llama_token_data> conf_candidates;
    conf_candidates.reserve(params.max_length);
    std::vector<int32_t> mask_positions;
    mask_positions.reserve(params.max_length);

    // Setup sampler chain
    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
    if (params.top_k > 0) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
    }
    if (params.top_p < 1.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
    }
    if (params.temperature > 0.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
    }
    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));

    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);

    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
    batch.n_tokens    = params.max_length;

    // Pre-allocate buffers for CFG if needed
    int32_t                  logits_size = n_vocab * params.max_length;
    std::vector<float>       cond_logits_buffer;
    std::vector<llama_token> un_x_buffer;
    if (params.cfg_scale > 0.0f) {
        cond_logits_buffer.resize(logits_size);
        un_x_buffer.resize(params.max_length);
    }

    // For block-based processing
    std::vector<int32_t> num_transfer_tokens;
    int32_t              num_blocks      = 1;
    int32_t              steps_per_block = params.steps;

    if (params.schedule == BLOCK_BASED) {
        GGML_ASSERT(params.max_length % params.block_length == 0);
        num_blocks = params.max_length / params.block_length;
        GGML_ASSERT(params.steps % num_blocks == 0);
        steps_per_block = params.steps / num_blocks;
    }

    std::vector<float> confidence(params.max_length);

    int64_t total_sampling_time = 0;
    int64_t total_time          = 0;
    int64_t time_start          = ggml_time_us();

    for (int block_num = 0; block_num < num_blocks; block_num++) {
        int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
        int32_t block_end   = (params.schedule == BLOCK_BASED) ?
                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
                                  params.max_length;

        // Count masked tokens in current block for block-based processing
        if (params.schedule == BLOCK_BASED) {
            int32_t block_mask_count = 0;
            for (int i = block_start; i < block_end; i++) {
                if (output_tokens[i] == params.mask_token_id) {
                    block_mask_count++;
                }
            }
            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
        }

        for (int32_t step = 0; step < steps_per_block; step++) {
            int32_t global_step = block_num * steps_per_block + step;

            if (params.step_callback) {
                if (!params.step_callback(
                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
                    break;
                }
            }

            // Setup batch
            for (int32_t i = 0; i < params.max_length; i++) {
                batch.token[i]     = output_tokens[i];
                batch.pos[i]       = i;
                batch.n_seq_id[i]  = 1;
                batch.seq_id[i][0] = 0;
                batch.logits[i]    = 1;
            }

            float * logits = nullptr;

            if (params.cfg_scale > 0.0f) {
                int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate conditional");
                    break;
                }
                float * cond_logits_ptr = llama_get_logits(ctx);
                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));

                // Unconditional generation (mask input)
                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
                for (int32_t i = 0; i < n_input; i++) {
                    un_x_buffer[i] = params.mask_token_id;
                }

                for (int32_t i = 0; i < params.max_length; i++) {
                    batch.token[i] = un_x_buffer[i];
                }
                ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate unconditional");
                    break;
                }
                float * uncond_logits = llama_get_logits(ctx);

                // Apply CFG
                for (int32_t i = 0; i < logits_size; i++) {
                    cond_logits_buffer[i] =
                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
                }
                logits = cond_logits_buffer.data();
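                // note (editorial): the loop above is standard classifier-free guidance:
                // with guidance weight w = cfg_scale + 1, logits = uncond + w * (cond - uncond).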
            } else {
                int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
                    break;
                }
                logits = llama_get_logits(ctx);
            }

            if (!logits) {
                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                break;
            }

            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
                if (params.shift_logits) {
                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
                }
                return logits + (pos) * n_vocab;
            };
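            // note (editorial): when the model's diffusion.shift_logits metadata is set,
            // position pos reads the logits row produced at pos - 1 (a next-token
            // parameterization), with position 0 falling back to its own row.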
            int64_t time_start_sampling = ggml_time_us();

            mask_positions.clear();
            for (int32_t i = 0; i < params.max_length; i++) {
                if (output_tokens[i] == params.mask_token_id) {
                    // For block-based, only consider current block
                    if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
                        mask_positions.push_back(i);
                    }
                }
            }

            if (mask_positions.empty()) {
                break;
            }

            if (params.add_gumbel_noise && params.temperature > 0.0f) {
                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
            }

            if (params.algorithm == ORIGIN) {
                int32_t transfer_count = calculate_transfer_count(
                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
                float p_transfer = (float) transfer_count / mask_positions.size();

                for (int32_t pos : mask_positions) {
                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
                        const float * pos_logits = get_logits_for_pos(pos);
                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                            candidates[token_id].id    = token_id;
                            candidates[token_id].logit = pos_logits[token_id];
                            candidates[token_id].p     = 0.0f;
                        }

                        llama_token_data_array cur_p = {
                            candidates.data(),
                            (size_t) n_vocab,
                            -1,
                            false,
                        };

                        llama_sampler_apply(sampler, &cur_p);
                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
                    }
                }
            } else {
                std::vector<std::pair<float, int32_t>> confidences;
                std::vector<llama_token>               sampled_tokens(mask_positions.size());

                for (size_t i = 0; i < mask_positions.size(); i++) {
                    int32_t       pos        = mask_positions[i];
                    const float * pos_logits = get_logits_for_pos(pos);

                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                        candidates[token_id].logit = pos_logits[token_id];
                        candidates[token_id].p     = 0.0f;
                        candidates[token_id].id    = token_id;
                    }

                    llama_token_data_array cur_p = {
                        candidates.data(),
                        candidates.size(),
                        -1,
                        false,
                    };

                    llama_sampler_apply(sampler, &cur_p);
                    llama_token sampled_token = cur_p.data[cur_p.selected].id;

                    float conf = calculate_confidence(cur_p, params.algorithm, rng);

                    sampled_tokens[i] = sampled_token;
                    confidences.emplace_back(conf, i);
                }

                int32_t transfer_count = calculate_transfer_count(
                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);

                if (transfer_count > 0) {
                    if (params.alg_temp == 0.0f) {
                        std::partial_sort(confidences.begin(),
                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
                                          confidences.end(),
                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
                                              if (a.first != b.first) {
                                                  return a.first > b.first;
                                              }
                                              return a.second < b.second;
                                          });

                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
                            int32_t mask_idx   = confidences[i].second;
                            int32_t pos        = mask_positions[mask_idx];
                            output_tokens[pos] = sampled_tokens[mask_idx];
                        }
                    } else {
                        conf_candidates.clear();
                        for (size_t i = 0; i < confidences.size(); i++) {
                            float conf_logit = confidences[i].first / params.alg_temp;
                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
                        }

                        llama_token_data_array conf_array = {
                            conf_candidates.data(),
                            conf_candidates.size(),
                            -1,
                            false,
                        };

                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
                            llama_sampler_apply(dist_sampler, &conf_array);
                            int32_t selected_idx = conf_array.selected;
                            int32_t mask_idx     = selected_idx;
                            int32_t pos          = mask_positions[mask_idx];
                            output_tokens[pos]   = sampled_tokens[mask_idx];

                            conf_candidates[selected_idx].p = 0.0f;
                            conf_array.selected             = -1;
                        }
                    }
                }
            }

            int64_t time_end_sampling = ggml_time_us();
            total_sampling_time += time_end_sampling - time_start_sampling;
        }
    }

    int64_t time_end = ggml_time_us();
    total_time += time_end - time_start;

    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
            total_time / 1000.0,
            total_time / 1000.0 / params.steps,
            total_sampling_time / 1000.0 / params.steps);

    llama_batch_free(batch);
    llama_sampler_free(sampler);
    llama_sampler_free(dist_sampler);

    n_generated = params.max_length;
}

static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
    if (!use_chat_template) {
        return prompt;
    }

    auto chat_templates = common_chat_templates_init(model, "");

    common_chat_templates_inputs inputs;
    common_chat_msg              user_msg;
    user_msg.role                = "user";
    user_msg.content             = prompt;
    inputs.add_generation_prompt = true;
    inputs.messages.push_back(user_msg);

    auto result = common_chat_templates_apply(chat_templates.get(), inputs);

    return result.prompt;
}

int main(int argc, char ** argv) {
    ggml_time_init();

@@ -538,6 +400,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

    const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" };
    const char * alg_name    = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ?
                                   alg_names[params.diffusion.algorithm] :
                                   "UNKNOWN";

    common_init();
    llama_backend_init();

@@ -554,12 +421,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

    if (!llama_model_is_diffusion(model)) {
        LOG_ERR("error: unsupported model for diffusion");
        llama_model_free(model);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx                = params.n_ctx;
    ctx_params.n_batch              = params.n_batch;

@@ -581,12 +442,10 @@ int main(int argc, char ** argv) {
    const llama_vocab * vocab            = llama_model_get_vocab(model);
    std::string         formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);

    std::vector<llama_token> input_tokens = common_tokenize(vocab,
                                                            formatted_prompt,
    std::vector<llama_token> input_tokens = common_tokenize(vocab, formatted_prompt,
                                                            /*add special tokens*/ true,
                                                            /*parse special*/ true);

    int n_input = input_tokens.size();
    int n_input = input_tokens.size();

    if (n_input >= params.n_ctx) {
        LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
@@ -595,79 +454,44 @@ int main(int argc, char ** argv) {
        return 1;
    }

    struct diffusion_params ldiff_params = diffusion_default_params();
    ldiff_params.steps                   = params.diffusion.steps;
    ldiff_params.eps                     = params.diffusion.eps;
    ldiff_params.temperature             = params.sampling.temp;
    ldiff_params.top_p                   = params.sampling.top_p;
    ldiff_params.top_k                   = params.sampling.top_k;
    ldiff_params.algorithm               = static_cast<enum diffusion_alg>(params.diffusion.algorithm);
    ldiff_params.alg_temp                = params.diffusion.alg_temp;
    ldiff_params.seed                    = params.sampling.seed;

    llama_token mask_token_id = llama_vocab_mask(vocab);
    GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);

    bool visual_mode = params.diffusion.visual_mode;

    int32_t n_generated = 0;
    std::vector<llama_token> output_tokens(params.n_ubatch);

    struct diffusion_params diff_params;

    char shift_logits_str[8];
    if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) {
        diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0);
    } else {
        diff_params.shift_logits = true;
    }

    // Use either eps or block length, but not both
    GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));

    if (params.diffusion.eps) {
        diff_params.schedule = TIMESTEP_BASED;
        diff_params.eps      = params.diffusion.eps;
    } else if (params.diffusion.block_length) {
        diff_params.schedule     = BLOCK_BASED;
        diff_params.block_length = params.diffusion.block_length;
    }

    diff_params.mask_token_id    = mask_token_id;
    diff_params.seed             = params.sampling.seed;
    diff_params.temperature      = params.sampling.temp;
    diff_params.steps            = params.diffusion.steps;
    diff_params.algorithm        = static_cast<diffusion_algorithm>(params.diffusion.algorithm);
    diff_params.max_length       = params.n_ubatch;
    diff_params.top_p            = params.sampling.top_p;
    diff_params.top_k            = params.sampling.top_k;
    diff_params.visual_mode      = params.diffusion.visual_mode;
    diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise;

    diff_params.step_callback = diffusion_step_callback;
    callback_data cb_data     = { &diff_params, vocab, n_input };
    diff_params.step_callback_user_data = &cb_data;

    const char * alg_names[]   = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
    const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
    const char * alg_name =
        (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
    const char * sched_name =
        (diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN";

    LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps);
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length);
    LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
    LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
    LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
    if (diff_params.schedule == TIMESTEP_BASED) {
        LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
        LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
    }
    if (diff_params.schedule == BLOCK_BASED) {
        LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
        LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
    }
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps);
    LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps);
    LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm,
            alg_name);
    LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp);

    diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated);
    ldiff_params.mask_token_id = mask_token_id;

    callback_data cb_data = { &params.diffusion, vocab, n_input };

    ldiff_params.step_callback           = diffusion_step_callback;
    ldiff_params.step_callback_user_data = &cb_data;

    int32_t n_generated = 0;

    std::vector<llama_token> output_tokens(params.n_ubatch);
    diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch,
                       ldiff_params, n_generated);

    if (n_generated > 0) {
        if (visual_mode) {
        if (params.diffusion.visual_mode) {
            // clear screen and move cursor to top-left
            LOG_INF("\033[2J\033[H");
        }

        output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
        std::string output_data = common_detokenize(vocab, output_tokens, false);
        LOG_INF("\n%s\n", output_data.c_str());
@@ -81,14 +81,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
params.embedding = true;
|
||||
|
||||
// if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
|
||||
// --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
|
||||
// in order to support any number of prompts
|
||||
if (params.n_parallel == 1) {
|
||||
LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
|
||||
params.kv_unified = true;
|
||||
}
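    // illustrative invocation (model and file names are placeholders): when
    // the prompt count is known, pass it via the standard --parallel flag
    // instead of relying on the unified-cache fallback, e.g.
    //     llama-embedding -m model.gguf -f prompts.txt --parallel 4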

    // utilize the full context
    if (params.n_batch < params.n_ctx) {
        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);

@@ -7,7 +7,6 @@
#include <cstdio>
#include <string>
#include <vector>
#include <numeric>

/**
 * This is the arbitrary data which will be passed to each callback.

@@ -78,12 +77,6 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
            LOG(" ]\n");
            LOG(" sum = %f\n", sum);
        }

    // TODO: make this abort configurable/optional?
    if (std::isnan(sum)) {
        LOG_ERR("encountered NaN - aborting\n");
        exit(0);
    }
}

/**
@@ -15,12 +15,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

    if (params.n_parallel == 1) {
        // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
        printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
        params.kv_unified = true;
    }

    common_init();

    if (params.n_predict < 0) {

@@ -59,15 +59,13 @@ int main(int argc, char ** argv) {
    }

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

    common_init_result llama_init_dft = common_init_from_params(params);

    //model_dft = llama_init_dft.model.get();
    ctx_dft = llama_init_dft.context.get();

    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
        return 1;
    }

    // Tokenize the prompt

@@ -132,10 +130,7 @@ int main(int argc, char ** argv) {
    params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
    params_spec.p_min = p_min;

    struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
    for (auto &pair : params.speculative.replacements) {
        common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
    }
    struct common_speculative * spec = common_speculative_init(ctx_dft);

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);

@@ -85,8 +85,6 @@ int main(int argc, char ** argv) {
    }

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

    common_init_result llama_init_dft = common_init_from_params(params);

    model_dft = llama_init_dft.model.get();
@@ -10,20 +10,20 @@
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

int main(int argc, char ** argv) {
    common_params params;

    params.escape = false;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
        return 1;
    }

    if (params.use_mmap) {
        LOG_INF("%s: force disabling memory mapping because it would result in read-only pointers to the weights\n",
                __func__);
        LOG_INF("%s: force disabling memory mapping because it would result in read-only pointers to the weights\n", __func__);
        params.use_mmap = false;
    }
    if (params.cache_type_k != GGML_TYPE_F32) {

@@ -38,10 +38,11 @@ int main(int argc, char ** argv) {
    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model and apply lora adapter, if any
    common_init_result llama_init = common_init_from_params(params);
    llama_model_ptr & model = llama_init.model;
    llama_context_ptr & ctx = llama_init.context;

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);

@@ -54,32 +55,31 @@ int main(int argc, char ** argv) {
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
    constexpr float val_split = 0.05f;

    struct lr_opt & lr = params.lr;
    LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
            ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs,
            (unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split);
    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);

    struct llama_opt_params lopt_params{
        /*n_ctx_train =*/0,
        /*param_filter =*/llama_opt_param_filter_all,
        /*param_filter_ud =*/nullptr,
        /*get_opt_pars =*/common_opt_lr_pars,
        /*get_opt_pars_ud =*/&params.lr,
        /*optimizer_type =*/params.optimizer,
    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
    optimizer_params.adamw.alpha = 1e-7f; // learning rate

    struct llama_opt_params lopt_params {
        /*n_ctx_train =*/ 0,
        /*param_filter =*/ llama_opt_param_filter_all,
        /*param_filter_ud =*/ nullptr,
        /*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
        /*get_opt_pars_ud =*/ &optimizer_params,
    };
    llama_opt_init(ctx.get(), model.get(), lopt_params);

    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval = ggml_opt_result_init();

    for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
    for (int epoch = 0; epoch < 2; ++epoch) {
        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
                        ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
        fprintf(stderr, "\n");

        ggml_opt_result_reset(result_train);

@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);

    llama_model_save_to_file(model.get(), params.out_file.c_str());
    llama_model_save_to_file(model.get(), "finetuned-model.gguf");

    llama_backend_free();
@@ -36,6 +36,9 @@
#     ```
#     nixConfig = {
#       extra-substituters = [
#         # Populated by the CI in ggml-org/llama.cpp
#         "https://llama-cpp.cachix.org"
#
#         # A development cache for nixpkgs imported with `config.cudaSupport = true`.
#         # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
#         # This lets one skip building e.g. the CUDA-enabled openmpi.

@@ -44,8 +47,10 @@
#       ];
#
#       # Verify these are the same keys as published on
#       # - https://app.cachix.org/cache/llama-cpp
#       # - https://app.cachix.org/cache/cuda-maintainers
#       extra-trusted-public-keys = [
#         "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
#         "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
#       ];
#     };

@@ -39,9 +39,8 @@ if (WIN32)
    set(CMAKE_SHARED_MODULE_PREFIX "")
endif()

option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL)")
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

#
# option list

@@ -132,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_NNPA "ggml: enable nnpa" ON)

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")

@@ -175,8 +174,6 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)

@@ -188,7 +185,6 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_ZDNN "ggml: use zDNN" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -34,8 +34,8 @@ if (NOT GGML_SHARED_LIB)

    if (GGML_BLAS)
        find_dependency(BLAS)
        list(APPEND GGML_BLAS_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
        list(APPEND GGML_BLAS_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
    endif()

    if (GGML_CUDA)

@@ -125,56 +125,54 @@ if(NOT TARGET ggml::ggml)
                          IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")

    set(_ggml_all_targets "")
    if (NOT GGML_BACKEND_DL)
        foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
            string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
            string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)

            find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
                REQUIRED
                HINTS ${GGML_LIB_DIR}
                NO_CMAKE_FIND_ROOT_PATH)

            message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")

            add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
            set_target_properties(ggml::${_ggml_backend}
                PROPERTIES
                INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
                IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
                IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
                INTERFACE_COMPILE_FEATURES c_std_90
                POSITION_INDEPENDENT_CODE ON)

            string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
            if(is_cpu_variant)
                list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
                set_target_properties(ggml::${_ggml_backend}
                    PROPERTIES
                    INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")

                if(GGML_CPU_INTERFACE_LINK_OPTIONS)
                    set_target_properties(ggml::${_ggml_backend}
                        PROPERTIES
                        INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
                endif()

            else()
                list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
                set_target_properties(ggml::${_ggml_backend}
                    PROPERTIES
                    INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")

                if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
                    set_target_properties(ggml::${_ggml_backend}
                        PROPERTIES
                        INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
                endif()
            endif()

            list(APPEND _ggml_all_targets ggml::${_ggml_backend})
        endforeach()
    endif()

    list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
    set_target_properties(ggml::ggml
@@ -74,26 +74,16 @@ extern "C" {
        GGML_OPT_BUILD_TYPE_OPT = 30,
    };

    enum ggml_opt_optimizer_type {
        GGML_OPT_OPTIMIZER_TYPE_ADAMW,
        GGML_OPT_OPTIMIZER_TYPE_SGD,

        GGML_OPT_OPTIMIZER_TYPE_COUNT
    };

    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
    struct ggml_opt_optimizer_params {
        // AdamW optimizer parameters
        struct {
            float alpha; // learning rate
            float beta1; // first AdamW momentum
            float beta2; // second AdamW momentum
            float beta1;
            float beta2;
            float eps;   // epsilon for numerical stability
            float wd;    // weight decay - 0.0f to disable
            float wd;    // weight decay for AdamW, use 0.0f to disable
        } adamw;
        struct {
            float alpha; // learning rate
            float wd;    // weight decay
        } sgd;
    };
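    // a minimal constant-parameter callback built on this struct could look
    // like the following (sketch; the name and values are illustrative):
    //     static struct ggml_opt_optimizer_params my_sgd_pars(void * userdata) {
    //         struct ggml_opt_optimizer_params p = ggml_opt_get_default_optimizer_params(NULL);
    //         p.sgd.alpha = 1e-3f; // learning rate
    //         p.sgd.wd    = 0.0f;  // disable weight decay
    //         return p;
    //     }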

    // callback to calculate optimizer parameters prior to a backward pass

@@ -122,11 +112,8 @@ extern "C" {

        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done

        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters

        // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
        enum ggml_opt_optimizer_type optimizer;
        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
    };

    // get parameters for an optimization context with defaults set where possible

@@ -155,10 +142,6 @@ extern "C" {
    // get the gradient accumulator for a node from the forward graph
    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

    GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); // TODO: consistent naming scheme

    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);

    // ====== Optimization Result ======

    GGML_API ggml_opt_result_t ggml_opt_result_init(void);

@@ -243,14 +226,12 @@ extern "C" {
            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
            enum ggml_opt_loss_type         loss_type,      // loss to minimize
            enum ggml_opt_optimizer_type    optimizer,      // sgd or adamw
            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
            int64_t                         nepoch,         // how many times the dataset should be iterated over
            int64_t                         nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
            bool                            silent);        // whether or not info prints to stderr should be suppressed

#ifdef __cplusplus
}
#endif

@@ -1,16 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

#ifdef __cplusplus
}
#endif
@@ -241,8 +241,6 @@
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24

#define GGML_MROPE_SECTIONS 4

#define GGML_UNUSED(x) (void)(x)

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -306,16 +304,6 @@
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

#define GGML_TENSOR_TERNARY_OP_LOCALS \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
    GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

#define GGML_TENSOR_BINARY_OP_LOCALS01 \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \

@@ -407,8 +395,7 @@ extern "C" {
        // GGML_TYPE_IQ4_NL_4_4 = 36,
        // GGML_TYPE_IQ4_NL_4_8 = 37,
        // GGML_TYPE_IQ4_NL_8_8 = 38,
        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
        GGML_TYPE_COUNT = 40,
        GGML_TYPE_COUNT = 39,
    };

    // precision

@@ -443,7 +430,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
        GGML_FTYPE_MOSTLY_MXFP4  = 25, // except 1d tensors
    };

    // available tensor operations:

@@ -452,7 +438,6 @@ extern "C" {

        GGML_OP_DUP,
        GGML_OP_ADD,
        GGML_OP_ADD_ID,
        GGML_OP_ADD1,
        GGML_OP_ACC,
        GGML_OP_SUB,

@@ -542,7 +527,6 @@ extern "C" {
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_OPT_STEP_ADAMW,
        GGML_OP_OPT_STEP_SGD,

        GGML_OP_GLU,

@@ -573,7 +557,6 @@ extern "C" {
        GGML_GLU_OP_REGLU,
        GGML_GLU_OP_GEGLU,
        GGML_GLU_OP_SWIGLU,
        GGML_GLU_OP_SWIGLU_OAI,
        GGML_GLU_OP_GEGLU_ERF,
        GGML_GLU_OP_GEGLU_QUICK,

@@ -848,13 +831,6 @@ extern "C" {
            struct ggml_tensor  * b,
            enum   ggml_type      type);

    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
    GGML_API struct ggml_tensor * ggml_add_id(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            struct ggml_tensor  * ids);
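    // reference semantics of ggml_add_id as a plain loop (sketch, not part
    // of the API surface): for each (i2, i1, i0) of dst,
    //     dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
    // i.e. ids selects, per row of a, which row of b gets added to it.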

    GGML_API struct ggml_tensor * ggml_add1(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -1222,13 +1198,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

    GGML_API struct ggml_tensor * ggml_swiglu_oai(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            float                 alpha,
            float                 limit);

    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,

@@ -1601,10 +1570,6 @@ extern "C" {
            float                 scale,
            float                 max_bias);

    GGML_API void ggml_soft_max_add_sinks(
            struct ggml_tensor * a,
            struct ggml_tensor * sinks);

    GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -1663,7 +1628,7 @@ extern "C" {
            struct ggml_tensor  * b,
            struct ggml_tensor  * c,
            int                   n_dims,
            int                   sections[GGML_MROPE_SECTIONS],
            int                   sections[4],
            int                   mode,
            int                   n_ctx_orig,
            float                 freq_base,

@@ -1689,22 +1654,6 @@ extern "C" {
            float                 beta_fast,
            float                 beta_slow);

    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            struct ggml_tensor  * c,
            int                   n_dims,
            int                   sections[GGML_MROPE_SECTIONS],
            int                   mode,
            int                   n_ctx_orig,
            float                 freq_base,
            float                 freq_scale,
            float                 ext_factor,
            float                 attn_factor,
            float                 beta_fast,
            float                 beta_slow);

    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -2103,10 +2052,6 @@ extern "C" {
    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
            const struct ggml_tensor * a);

    GGML_API void ggml_flash_attn_ext_add_sinks(
            struct ggml_tensor * a,
            struct ggml_tensor * sinks);

    // TODO: needs to be adapted to ggml_flash_attn_ext
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
            struct ggml_context * ctx,

@@ -2312,14 +2257,7 @@ extern "C" {
            struct ggml_tensor  * grad,
            struct ggml_tensor  * m,
            struct ggml_tensor  * v,
            struct ggml_tensor  * adamw_params); // parameters such as the learning rate

    // stochastic gradient descent step (with weight decay)
    GGML_API struct ggml_tensor * ggml_opt_step_sgd(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * grad,
            struct ggml_tensor  * sgd_params); // alpha, weight decay
            struct ggml_tensor  * adamw_params); // parameters such a the learning rate
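    // the SGD step presumably applies the usual decoupled weight-decay
    // update, with alpha and wd taken from sgd_params (sketch):
    //     w = w * (1 - alpha * wd) - alpha * g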

    //
    // automatic differentiation

@@ -214,13 +214,6 @@ add_library(ggml
            ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)

if (GGML_BACKEND_DIR)
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
    endif()
    target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
endif()

target_link_libraries(ggml PUBLIC ggml-base)

if (CMAKE_SYSTEM_NAME MATCHES "Linux")

@@ -234,11 +227,7 @@ function(ggml_add_backend_library backend)
        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
        add_dependencies(ggml ${backend})
        if (GGML_BACKEND_DIR)
            install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
        else()
            install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
        endif()
        install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
    else()
        add_library(${backend} ${ARGN})
        target_link_libraries(ggml PUBLIC ${backend})

@@ -382,7 +371,6 @@ ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)

foreach (target ggml-base ggml)
@@ -29,7 +29,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:

@@ -49,10 +49,6 @@
#include "ggml-webgpu.h"
#endif

#ifdef GGML_USE_ZDNN
#include "ggml-zdnn.h"
#endif

#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif

@@ -184,9 +180,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_WEBGPU
        register_backend(ggml_backend_webgpu_reg());
#endif
#ifdef GGML_USE_ZDNN
        register_backend(ggml_backend_zdnn_reg());
#endif
#ifdef GGML_USE_OPENCL
        register_backend(ggml_backend_opencl_reg());
#endif

@@ -505,9 +498,6 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,

    std::vector<fs::path> search_paths;
    if (user_search_path == nullptr) {
#ifdef GGML_BACKEND_DIR
        search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
#endif
        // default search paths: executable directory, current directory
        search_paths.push_back(get_executable_path());
        search_paths.push_back(fs::current_path());
@@ -1071,11 +1071,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                }
            }
        }
        // if the node is still unassigned, assign it to the first backend that supports it
        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
        }
        GGML_ASSERT(*cur_backend_id != -1);
    }

    // pass 5: split graph, find tensors that need to be copied

@@ -1103,7 +1098,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

        const int node_backend_id = tensor_backend_id(node);

        GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
        assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

        // check if we should start a new split based on the sources of the current node
        bool need_new_split = false;

@@ -1161,7 +1156,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

            size_t src_id = hash_id(src);
            const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
            GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
            assert(src_backend_id != -1); // all inputs should be assigned by now

            if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {

@@ -281,10 +281,10 @@ ggml_backend_t ggml_backend_blas_init(void) {
    ggml_backend_blas_context * ctx = new ggml_backend_blas_context;

    ggml_backend_t backend = new ggml_backend {
        /* .guid      = */ ggml_backend_blas_guid(),
        /* .iface     = */ blas_backend_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
        /* .context   = */ ctx,
        /* .guid      = */ ggml_backend_blas_guid(),
        /* .interface = */ blas_backend_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
        /* .context   = */ ctx,
    };

#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
@@ -31,13 +31,6 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)

if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
    message(FATAL_ERROR
        "CANN Graph (ACL graph mode) is not supported on 310P devices. "
        "Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
endif()

if (CANN_INSTALL_DIR)
    # Only Support Linux.

@@ -75,13 +68,6 @@ if (CANN_INSTALL_DIR)

    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")

    if (USE_ACL_GRAPH)
        target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
        message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
    else()
        message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
    endif()

    message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
    message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
else()

@@ -77,8 +77,6 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    for (int i = 0; i < final_dims; i++) {
        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
    }
    size_t elem_offset = offset / ggml_element_size(tensor);
    acl_storage_len += elem_offset;

    // Reverse ne and stride.
    std::reverse(acl_ne, acl_ne + final_dims);

@@ -86,7 +84,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,

    aclTensor* acl_tensor = aclCreateTensor(
        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
        elem_offset, format, &acl_storage_len, 1,
        offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
        tensor->data);

    return acl_tensor;
@@ -68,8 +68,6 @@
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_zero.h>
#include <aclnnop/aclnn_index_copy.h>
#include <aclnnop/aclnn_index_select.h>
#include <float.h>

#include <cmath>

@@ -101,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
    }
}

void ggml_cann_op_unary(
void ggml_cann_unary_op(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

@@ -113,42 +111,6 @@ void ggml_cann_op_unary(
    ggml_cann_release_resources(ctx, acl_src, acl_dst);
}

void ggml_cann_op_unary_gated(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(ggml_is_contiguous_1(dst));
    const int32_t swapped = ggml_get_op_params_i32(dst, 1);

    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src0->type == src1->type);

        acl_src0 = ggml_cann_create_tensor(src0);
        acl_src1 = ggml_cann_create_tensor(src1);
    } else {
        int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
        size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
        acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
        acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
        if (swapped) {
            std::swap(acl_src0, acl_src1);
        }
    }

    unary_op(ctx, acl_src0, acl_dst);
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);

    ggml_cann_release_resources(ctx, acl_src0, acl_dst);
    if (src1)
        ggml_cann_release_resources(ctx, acl_src1);
}

/**
 * @brief Repeats elements of a tensor along each dimension according to the
 *        specified repeat array.
@@ -753,55 +715,69 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];

    aclTensor* acl_src = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    if (ggml_are_same_shape(src0, dst)) {
        aclTensor* acl_src = ggml_cann_create_tensor(src0);
        aclTensor* acl_dst = ggml_cann_create_tensor(dst);
        if (dst->type == src0->type) {
            cann_copy(ctx, acl_src, acl_dst);
        } else {
            aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
        }
        ggml_cann_release_resources(ctx, acl_src, acl_dst);
    } else {
        void* src_trans_buffer = src0->data;
        ggml_cann_pool_alloc src_buffer_allocator;
        if (!ggml_is_contiguous(src0)) {
            aclTensor* acl_src = ggml_cann_create_tensor(src0);
            src_buffer_allocator.alloc(ctx.pool(),
                                       ggml_nelements(src0) * ggml_type_size(src0->type));
            src_trans_buffer = src_buffer_allocator.get();
        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
            if (dst->type == src0->type) {
                size_t cpy_size = ggml_nbytes(dst);
                ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
                                       ACL_MEMCPY_DEVICE_TO_DEVICE);
                return;
            } else {
                ggml_cann_pool_alloc src_buffer_allocator(
                    ctx.pool(),
                    ggml_nelements(dst) * ggml_type_size(dst->type));
                void* src_trans_buffer = src_buffer_allocator.get();
                size_t src_trans_nb[GGML_MAX_DIMS];
                src_trans_nb[0] = ggml_type_size(dst->type);
                for (int i = 1; i < GGML_MAX_DIMS; i++) {
                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
                }
                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
                    GGML_MAX_DIMS);

                aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
                size_t cpy_size = ggml_nbytes(dst);
                ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
                                       ACL_MEMCPY_DEVICE_TO_DEVICE);
                ggml_cann_release_resources(ctx, src_trans_tensor);
                return;
            }
        } else if (ggml_is_contiguous(dst)) {
            ggml_cann_pool_alloc src_buffer_allocator(
                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
            void* src_trans_buffer = src_buffer_allocator.get();
            size_t src_trans_nb[GGML_MAX_DIMS];
            src_trans_nb[0] = ggml_type_size(src0->type);
            src_trans_nb[0] = ggml_type_size(dst->type);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
                src_trans_buffer, ggml_cann_type_mapping(src0->type),
                ggml_type_size(src0->type), src0->ne, src_trans_nb,
                src_trans_buffer, ggml_cann_type_mapping(dst->type),
                ggml_type_size(dst->type), src0->ne, src_trans_nb,
                GGML_MAX_DIMS);
            cann_copy(ctx, acl_src, src_trans_tensor);
            ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
        }

        size_t src_reshape_nb[GGML_MAX_DIMS];
        src_reshape_nb[0] = ggml_type_size(src0->type);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
        }
        aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));

        aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer,
            ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
            dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
        aclTensor* acl_dst = ggml_cann_create_tensor(dst);

        if (dst->type == src0->type) {
            cann_copy(ctx, trans_acl_src, acl_dst);
            size_t cpy_size = ggml_nbytes(dst);
            ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
                                   ACL_MEMCPY_DEVICE_TO_DEVICE);
            ggml_cann_release_resources(ctx, src_trans_tensor);
            return;
        } else {
            aclnn_cast(ctx, trans_acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
            GGML_ABORT("Unsupported: dst is not contiguous.");
        }
        ggml_cann_release_resources(ctx, trans_acl_src, acl_dst);
    }
    return;
    ggml_cann_release_resources(ctx, acl_src, acl_dst);
}

/**
@@ -1316,196 +1292,160 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
}

/**
 * @brief Generate a range of values and apply a scalar base exponentiation.
 * @brief Applies the Alibi (Attention with Linear Biases) mechanism to the
 * @details This function implements the Alibi mechanism, which introduces
 *          learnable biases into the attention scores to simulate relative
 *          position encoding without the need for explicit positional
 *          embeddings.
 *
 * This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
 * with step size `step`, stores it in a temporary buffer, and then computes:
 * @param ctx The backend CANN context for executing operations.
 * @param acl_src The source tensor representing the query or key.
 * @param acl_position The position tensor containing relative positions.
 * @param acl_dst The destination tensor where the result will be stored.
 * @param n_head The number of attention heads.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb0 The byte size of the first dimension of the source
 *        tensor.
 * @param max_bias The maximum bias value used in the Alibi mechanism.
 * @param dst The destination tensor object for additional metadata.
 *
 * @f[
 * slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
 * @f]
 *
 * The results are written to the provided @p slope_buffer.
 *
 * @param ctx CANN backend context for memory allocation and operator execution.
 * @param slope_buffer Pointer to the output buffer (float array) for the computed slope values.
 * @param m Scalar base for the exponentiation.
 * @param size Number of elements in the generated sequence.
 * @param start Starting exponent offset.
 * @param stop Stopping exponent offset (exclusive).
 * @param step Step size for the exponent increment.
 * The function performs the following steps:
 * 1. Calculates the logarithm floor of the number of heads to determine the
 *    base for bias calculation.
 * 2. Initializes arrays with arithmetic sequences and fills them with bias
 *    values.
 * 3. Computes the bias tensor based on the calculated biases and arithmetic
 *    sequences.
 * 4. Reshapes the bias tensor to match the dimensions of the input tensors.
 * 5. Multiplies the position tensor by the bias tensor.
 * 6. Adds the result of the multiplication to the source tensor to produce the
 *    final output.
 */
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
                                  float m, int64_t size, float start, float stop, float step) {
    int64_t ne[] = {size};
    size_t nb[] = {sizeof(float)};
static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                        aclTensor* acl_position, aclTensor* acl_dst,
                        const int n_head, int64_t* src_ne, const size_t src_nb0,
                        float max_bias, ggml_tensor* dst) {
    const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
    GGML_ASSERT(src_nb0 == sizeof(float));
    GGML_ASSERT(n_head == src_ne[2]);

    ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(float));
    void* arange_buffer = arange_allocator.get();
    const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));

    aclTensor* arange_tensor = ggml_cann_create_tensor(
        arange_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
    aclnn_arange(ctx, arange_tensor, start, stop, step, size);
    float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    aclTensor* slope_tensor = ggml_cann_create_tensor(
        slope_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
    // init arange
    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                                          ne2_ne3 * ggml_type_size(dst->type));
    void* tmp_arange_buffer = arange_allocator.get();

    aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
    // arange1: [1, ..., n_heads_log2_floor+1)
    float start = 1;
    float stop = n_heads_log2_floor + 1;
    float step = 1;
    int64_t n_elements_arange = n_heads_log2_floor;

    GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
    ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
    int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
    size_t tmp_arange1_nb[] = {sizeof(dst->type)};
    aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

    aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);

    aclTensor* tmp_arange2_tensor = nullptr;
    if (n_heads_log2_floor < ne2_ne3) {
        // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
        start = 1;
        stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
        step = 2;
        n_elements_arange = ne2_ne3 - n_heads_log2_floor;
        int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
        size_t tmp_arange2_nb[] = {sizeof(dst->type)};

        aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
            (char*)tmp_arange_buffer +
                n_heads_log2_floor * ggml_type_size(dst->type),
            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
            tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
        aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
                     n_elements_arange);
    }

    // init mk_base
    ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
                                           ne2_ne3 * ggml_type_size(dst->type));
    void* tmp_mk_base_buffer = mk_base_allocator.get();
    int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
    size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
    aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

    aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);

    aclTensor* tmp_mk_base2_tensor = nullptr;
    if (n_heads_log2_floor < ne2_ne3) {
        int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
        size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
        aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
            (char*)tmp_mk_base_buffer +
                n_heads_log2_floor * ggml_type_size(dst->type),
            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
            tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
        aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
    }

    // init mk
    int64_t tmp_mk_base_ne[] = {ne2_ne3};
    size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
    aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
    aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
    aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);

    // reshape mk
    int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
    size_t tmp_mk_nb[GGML_MAX_DIMS];
    tmp_mk_nb[0] = ggml_type_size(dst->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
    }
    aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
        ACL_FORMAT_ND);

    // acl_position * mk
    int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
    size_t tmp_output_nb[GGML_MAX_DIMS];
    tmp_output_nb[0] = ggml_type_size(dst->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
    }
    ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
    void* tmp_output_buffer = output_allocator.get();
    aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
        tmp_output_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
        ACL_FORMAT_ND);
    aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);

    // add
    aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
    ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
        tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
        tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
}

/**
 * @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
 *
 * This function generates slope values for each attention head according to the ALiBi
 * (Attention with Linear Biases) method. It splits the computation into two ranges depending
 * on whether the head index is less than @p n_head_log2 or not, and uses different base values
 * (`m0` and `m1`) for the exponentiation.
 *
 * @f[
 * slope[h] =
 * \begin{cases}
 * m_0^{(h + 1)}, & h < n\_head\_log2 \\
 * m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
 * \end{cases}
 * \quad , \quad \text{if } max\_bias > 0
 * @f]
 *
 * If @p max_bias <= 0, all slope values are set to 1.0.
 *
 * @param ctx CANN backend context for memory allocation and operator execution.
 * @param n_head Total number of attention heads.
 * @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
 * @param max_bias Maximum bias value for slope computation.
 *
 */
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
                            void* slope_buffer, float max_bias) {
    const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

    float m0 = powf(2.0f, -(max_bias) / n_head_log2);
    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    // const float slope = (max_bias > 0.0f) ?
    //     h < n_head_log2 ?
    //         powf(m0, h + 1) :
    //         powf(m1, 2*(h - n_head_log2) + 1) :
    //     1.0f;
    // arange1
    float start = 0 + 1;
    float end = (n_head_log2 - 1) + 1;
    float step = 1;
    float count = n_head_log2;
    // end needs to be +1 because aclnn uses a left-closed, right-open interval.
    aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
    if (n_head_log2 < n_head) {
        // arange2
        start = 2 * (n_head_log2 - n_head_log2) + 1;
        end = 2 * ((n_head - 1) - n_head_log2) + 1;
        step = 2;
        count = n_head - n_head_log2;
        aclnn_get_slope_inner(
            ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
            m1, count, start, end + 1, step);
    }
}
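// CPU reference for the same slope schedule (sketch mirroring the
// documented formula; buffer management elided):
//     const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
//     const float m0 = powf(2.0f, -max_bias / n_head_log2);
//     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
//     for (int h = 0; h < n_head; h++) {
//         slope[h] = h < n_head_log2 ? powf(m0, h + 1)
//                                    : powf(m1, 2 * (h - n_head_log2) + 1);
//     }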

/**
 * @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
 *
 * This function computes the ALiBi slopes for each attention head (if max_bias > 0),
 * multiplies them with the attention mask to produce bias tensors, and adds these biases
 * to the destination tensor (@p dst).
 *
 * The function performs necessary broadcasting of the mask and slope tensors to match
 * the shape of the destination tensor, then applies element-wise multiplication and addition
 * using CANN operators.
 *
 * @param ctx CANN backend context for memory management and operator execution.
 * @param mask Input attention mask tensor, assumed to be contiguous.
 * @param dst Destination tensor to which ALiBi biases will be added.
 * @param dst_ptr Pointer to the memory of the destination tensor.
 * @param max_bias Maximum bias value controlling the slope scaling.
 *
 * @note
 * - Write data into dst_ptr using only the shape information of the dst tensor.
 * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
 */
static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
                            ggml_tensor* dst, void* dst_ptr, float max_bias) {
    void* slope_buffer = nullptr;
    void* bias_buffer  = nullptr;

    if (max_bias > 0.0f) {
        int64_t n_heads = dst->ne[2];
        ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
        slope_buffer = slope_allocator.get();
        ggml_cann_pool_alloc bias_allocator(
            ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
        bias_buffer = bias_allocator.get();
        aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
    }

    // broadcast for mask, slope and dst
    int64_t nr2 = dst->ne[2] / mask->ne[2];
    int64_t nr3 = dst->ne[3] / mask->ne[3];

    // broadcast the mask across rows
    int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
    size_t mask_nb[] = {
        mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
        mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
    };

    int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
    size_t dst_nb[] = {
        dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
        dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
    };

    // slope is a 1-dim tensor, slope.ne2 == dst.ne2
    int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
    size_t slope_nb[GGML_MAX_DIMS + 2];
    slope_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
        slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
    }

    aclTensor* acl_slope = ggml_cann_create_tensor(
        slope_buffer, ACL_FLOAT, sizeof(float),
        slope_ne, slope_nb, GGML_MAX_DIMS + 2);
    aclTensor* acl_mask = ggml_cann_create_tensor(
        mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);

    // write data into dst_ptr using only the shape information of the dst tensor
    aclTensor* acl_dst = ggml_cann_create_tensor(
        dst_ptr, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), dst_ne, dst_nb,
        GGML_MAX_DIMS + 2);

    if (max_bias > 0.0f) {
        int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
        size_t bias_nb[GGML_MAX_DIMS + 2];
        bias_nb[0] = sizeof(float);
        for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
            bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
        }
        aclTensor* bias_tensor = ggml_cann_create_tensor(
            bias_buffer, ACL_FLOAT, sizeof(float),
            bias_ne, bias_nb, GGML_MAX_DIMS + 2);

        aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
        aclnn_add(ctx, acl_dst, bias_tensor);
        ggml_cann_release_resources(ctx, bias_tensor);
    } else {
        aclnn_add(ctx, acl_dst, acl_mask);
    }
    ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
}
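
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Concrete example of the GGML_MAX_DIMS + 2 broadcast split used above,
// assuming GGML_MAX_DIMS == 4. A dst of shape [64, 32, 16, 4] with a mask of
// shape [64, 32, 2, 1] gives nr2 = 8 and nr3 = 4; dims 2 and 3 are each split
// into (shared, repeat) pairs so element-wise ops can broadcast. Shapes here
// are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
    int64_t dst[4]  = { 64, 32, 16, 4 };
    int64_t mask[4] = { 64, 32,  2, 1 };
    int64_t nr2 = dst[2] / mask[2];  // repeats along dim 2
    int64_t nr3 = dst[3] / mask[3];  // repeats along dim 3
    int64_t dst6[6]  = { dst[0],  dst[1], mask[2], nr2, mask[3], nr3 };
    int64_t mask6[6] = { mask[0], dst[1], mask[2], 1,   mask[3], 1 };
    for (int i = 0; i < 6; i++) {
        printf("dim %d: dst %lld vs mask %lld\n", i, (long long) dst6[i], (long long) mask6[i]);
    }
    return 0;
}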

void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_cann_dup(ctx, dst);
}

@@ -1523,135 +1463,165 @@ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 * @param acl_dst The destination tensor where the softmax results will be
 * stored.
 */
static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                          int64_t dim, aclTensor* acl_dst) {
    GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
}

void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];  // mask

    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst  = ggml_cann_create_tensor(dst);

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

    // input mul scale
    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
    ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
    void* src_tensor_buffer = src_tensor_allocator.get();
    aclTensor* softmax_tensor = ggml_cann_create_tensor(
        src_tensor_buffer, ggml_cann_type_mapping(src0->type),
        ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS);

    aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);

    if (src1) {
        aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
    }
    // softmax
    aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
    ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
}

void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];  // mask

    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst  = ggml_cann_create_tensor(dst);

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));

    // input mul scale
    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
    size_t n_bytes = ggml_nbytes(src0);
    ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
    void* input_mul_scale_buffer = mul_scale_allocator.get();
    aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
        input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
        src0->nb, GGML_MAX_DIMS);

    bool inplace = false;
    aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);

    // mask
    aclTensor* acl_src1_fp32_tensor = nullptr;
    aclTensor* tmp_mask_tensor = nullptr;
    ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
    if (src1) {
        const bool use_f16 = src1->type == GGML_TYPE_F16;
        if (use_f16) {
            // cast to fp32
            size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
            size_t src1_fp32_nb[GGML_MAX_DIMS];
            src1_fp32_nb[0] = sizeof(float_t);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
            }
            src1_fp32_allocator.alloc(n_bytes);
            void* src1_fp32_buffer = src1_fp32_allocator.get();
            acl_src1_fp32_tensor = ggml_cann_create_tensor(
                src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
                src1_fp32_nb, GGML_MAX_DIMS);
            aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
            aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
            ggml_cann_release_resources(ctx, acl_src1);
        } else {
            acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
        }

        // broadcast the mask across rows, only use ne11 of ne01 in mask
        if (src1->ne[1] != src0->ne[1]) {
            // mask shape: [1,1,ne11,ne10]
            int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
            size_t tmp_mask_nb[GGML_MAX_DIMS];
            tmp_mask_nb[0] = sizeof(float_t);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
            }
            tmp_mask_tensor = ggml_cann_create_tensor(
                src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
                GGML_MAX_DIMS, ACL_FORMAT_ND);
        }

        // alibi
        const int n_head = src0->ne[2];
        const size_t src_nb0 = src0->nb[0];

        n_bytes = ggml_nbytes(dst);
        ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
        void* output_buffer = output_allocator.get();
        aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
            output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
            dst->nb, GGML_MAX_DIMS);
        if (max_bias <= 0.0f) {
            // slope = 1.0
            if (tmp_mask_tensor) {
                aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
                          alibi_output_tensor);
            } else {
                aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
                          alibi_output_tensor);
            }
        } else {
            // slope != 1.0
            if (tmp_mask_tensor) {
                aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
                            alibi_output_tensor, n_head, src0->ne, src_nb0,
                            max_bias, dst);
            } else {
                aclnn_alibi(ctx, acl_input_mul_scale_tensor,
                            acl_src1_fp32_tensor, alibi_output_tensor, n_head,
                            src0->ne, src_nb0, max_bias, dst);
            }
        }

        // softmax
        aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
        ggml_cann_release_resources(ctx, alibi_output_tensor);
    } else {
        aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
    }

    ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
                                acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
}
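
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Host-side reference for what both soft-max variants above compute: scale and
// max_bias live in the first two floats of op_params, then
// dst = softmax(src0 * scale + slope * mask) row by row. A plain CPU sketch
// with hypothetical names, not the CANN kernel.
#include <cmath>
#include <vector>

static void softmax_row(std::vector<float>& x, float scale, const float* mask, float slope) {
    float max_v = -INFINITY;
    for (size_t i = 0; i < x.size(); i++) {
        x[i] = x[i] * scale + (mask ? slope * mask[i] : 0.0f);
        max_v = fmaxf(max_v, x[i]);
    }
    float sum = 0.0f;
    for (float& v : x) { v = expf(v - max_v); sum += v; }
    for (float& v : x) { v /= sum; }
}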

/**
 * @brief Performs index select operation on a 4D tensor using the CANN backend.
 *
 * This function applies the `IndexSelect` operation along a specific dimension
 * of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
 * It iterates over the last two dimensions of the source tensor, creates the corresponding
 * CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
 * operation for each slice.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer containing the 4D input tensor data.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param dst_buffer The destination buffer where the output tensor data will be written.
 * @param dst_ne The dimensions of the destination tensor.
 * @param dst_nb The strides (byte offsets) of the destination tensor.
 * @param index The index tensor specifying the indices to select from the source tensor.
 * @param type The data type of the source and destination tensors.
 */
static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
                                  void* src_buffer, int64_t* src_ne, size_t* src_nb,
                                  void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
                                  ggml_tensor* index, ggml_type type) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                src_ne, src_nb, 2);

            // index
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                index->ne, index->nb, 1);

            // out
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                dst_ne, dst_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
}
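
// ---- Illustrative sketch (not part of the diff) ----------------------------
// What one inner iteration above does, as plain C++: for a fixed (i, j) plane,
// out[r] = src[idx[r]] row-wise. Container layout and names are hypothetical.
#include <cstdint>
#include <vector>

static void index_select_rows(const std::vector<std::vector<float>>& src,
                              const std::vector<int64_t>& idx,
                              std::vector<std::vector<float>>& out) {
    out.resize(idx.size());
    for (size_t r = 0; r < idx.size(); r++) {
        out[r] = src[(size_t) idx[r]];  // dim-0 gather, like aclnnIndexSelect
    }
}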

/**
 * @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
 *
 * This function applies the `IndexCopy` operation along a specific dimension of the
 * destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
 * to positions specified by the index tensor (`index`).
 * It iterates over the last two dimensions of the tensors, creates the corresponding
 * CANN tensors for source, index, and destination slices, and performs the index copy
 * operation for each slice.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer containing the 4D input tensor data to be copied.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param dst_buffer The destination buffer where values will be copied to.
 * @param dst_ne The dimensions of the destination tensor.
 * @param dst_nb The strides (byte offsets) of the destination tensor.
 * @param index The index tensor specifying target positions in the destination tensor.
 * @param type The data type of the source and destination tensors.
 */
static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx,
                                void* src_buffer, int64_t* src_ne, size_t* src_nb,
                                void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
                                ggml_tensor* index, ggml_type type) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                src_ne, src_nb, 2);

            // index
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                index->ne, index->nb, 1);

            // out
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                dst_ne, dst_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
}

/**
 * @brief Performs embedding operation on a 4D tensor using the CANN backend.
 *
 * This function extracts slices from the source tensor (`src_buffer`),
 * index tensor (`index`), and destination tensor (`dst`), and performs an
 * embedding operation on them. The embedding operation is applied by iterating
 * over the last two dimensions of the source tensor, creating the necessary
 * tensors for the source, index, and output, and executing the embedding operation.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer holding the data for the source tensor.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param index The index tensor used in the embedding operation.
 * @param dst The destination tensor where the result will be stored.
 */
static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
                               int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
                               ggml_tensor* dst) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
                acl_src_ne, acl_src_nb, 2);

            // index
            int64_t acl_index_ne[1] = {index->ne[0]};
            size_t acl_index_nb[1] = {index->nb[0]};
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + i * index->nb[2] + j * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                acl_index_ne, acl_index_nb, 1);

            // out
            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
                acl_out_ne, acl_out_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, Embedding, acl_src_tensor, acl_index, acl_out);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
}
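
// ---- Illustrative sketch (not part of the diff) ----------------------------
// How the i/j loops above cover a 4-D ggml tensor with repeated 2-D kernel
// calls: one launch per (i, j) plane, with the index tensor broadcast via the
// modulo on its outer dims in the index-copy variant. A plain counting sketch
// with hypothetical shapes.
#include <cstdint>
#include <cstdio>

int main() {
    int64_t src_ne[4]   = { 128, 64, 4, 2 };  // ne[2] and ne[3] drive the loops
    int64_t index_ne[3] = { 64, 2, 1 };       // broadcast over plane coordinates
    int calls = 0;
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            int64_t idx_k = i % index_ne[2];  // wraps when index has fewer planes
            int64_t idx_j = j % index_ne[1];
            (void) idx_k; (void) idx_j;
            calls++;                          // one IndexSelect/Embedding launch per plane
        }
    }
    printf("kernel launches: %d\n", calls);   // prints 8
    return 0;
}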

@@ -1663,9 +1633,8 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

    switch (src0->type) {
        case GGML_TYPE_F32: {
            aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
                                  dst->data, dst->ne, dst->nb,
                                  src1, dst->type);
            aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
                               dst);
            break;
        }
        case GGML_TYPE_F16: {
@@ -1682,9 +1651,8 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
                src0->ne, src_trans_nb, GGML_MAX_DIMS);
            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
            aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
                                  dst->data, dst->ne, dst->nb,
                                  src1, dst->type);
            aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
                               src_trans_nb, src1, dst);
            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
            break;
        }
@@ -1744,10 +1712,8 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
            }

            aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
                                  dequant_ne, dequant_nb,
                                  dst->data, dst->ne, dst->nb,
                                  src1, dst->type);
            aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
                               dequant_ne, dequant_nb, src1, dst);

            ggml_cann_release_resources(ctx, dequant_tensor);
            break;
@@ -1758,43 +1724,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    }
}

void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];  // src
    ggml_tensor* src1 = dst->src[1];  // index

    switch (dst->type) {
        case GGML_TYPE_F32: {
            aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb,
                                dst->data, dst->ne, dst->nb,
                                src1, dst->type);
            break;
        }
        case GGML_TYPE_F16: {
            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
            ggml_cann_pool_alloc src_buffer_allocator(
                ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
            void* src_trans_buffer = src_buffer_allocator.get();
            size_t src_trans_nb[GGML_MAX_DIMS];
            src_trans_nb[0] = sizeof(uint16_t);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
                src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type),
                src0->ne, src_trans_nb, GGML_MAX_DIMS);
            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
            aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
                                dst->data, dst->ne, dst->nb,
                                src1, dst->type);
            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
            break;
        }
        default:
            GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
            break;
    }
}
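
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Host-side reference for the set-rows path above: the inverse of get-rows,
// dst[idx[r]] = src[r] row-wise. Hypothetical flat layout.
#include <cstdint>
#include <vector>

static void set_rows_ref(const std::vector<float>& src, const std::vector<int64_t>& idx,
                         std::vector<float>& dst, size_t row_size) {
    for (size_t r = 0; r < idx.size(); r++) {
        for (size_t c = 0; c < row_size; c++) {
            // scatter, like aclnnInplaceIndexCopy on dim 0
            dst[(size_t) idx[r] * row_size + c] = src[r * row_size + c];
        }
    }
}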

/**
 * @brief Repeats elements of a tensor along a specified dimension.
 *
@@ -1858,9 +1787,11 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
                                     bcast_weight_nb[4], bcast_weight_nb[5]};
    aclTensor* acl_weight_tensor;

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
    if (weight_to_nz && is_matmul_weight(weight)) {
    bool weightToNZ = false;
#ifdef ASCEND_310P
    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
    if (weightToNZ && is_matmul_weight(weight)) {
        int64_t acl_stride[2] = {1, transpose_ne[1]};

        // Reverse ne.

@@ -3153,24 +3084,104 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
        // Compute the slope if needed. Derived from ggml_cann_softmax().
        if (maxBias != 0.0f) {
            // alibi
            const int64_t n_heads = src0->ne[2];
            ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
            void* slope_buffer = slope_allocator.get();
            aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);

            int64_t slope_ne[] = {1, 1, n_heads, 1};
            size_t slope_nb[GGML_MAX_DIMS];
            slope_nb[0] = sizeof(float);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
            }

            aclTensor* slope_tensor = ggml_cann_create_tensor(
                slope_buffer, ACL_FLOAT, sizeof(float),
                slope_ne, slope_nb, GGML_MAX_DIMS);
            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);

            ggml_cann_release_resources(ctx, slope_tensor);

            const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
            const int64_t n_head = src0->ne[2];
            const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
            float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
            float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
            // init arange
            ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                                                  ne2_ne3 * faElemSize);
            void* tmp_arange_buffer = arange_allocator.get();

            // arange1: [1, ..., n_heads_log2_floor+1)
            float start = 1;
            float stop = n_heads_log2_floor + 1;
            float step = 1;
            int64_t n_elements_arange = n_heads_log2_floor;

            int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
            size_t tmp_arange1_nb[] = {faElemSize};
            aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
                tmp_arange_buffer, faDataType, faElemSize,
                tmp_arange1_ne, tmp_arange1_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

            aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);

            aclTensor* tmp_arange2_tensor = nullptr;
            if (n_heads_log2_floor < ne2_ne3) {
                // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
                start = 1;
                stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
                step = 2;
                n_elements_arange = ne2_ne3 - n_heads_log2_floor;
                int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
                size_t tmp_arange2_nb[] = {faElemSize};

                tmp_arange2_tensor = ggml_cann_create_tensor(
                    (char*)tmp_arange_buffer +
                        n_heads_log2_floor * faElemSize,
                    faDataType, faElemSize,
                    tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
                             n_elements_arange);
            }

            // init mk_base
            ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
                                                   ne2_ne3 * faElemSize);
            void* tmp_mk_base_buffer = mk_base_allocator.get();
            int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
            size_t tmp_mk_base1_nb[] = {faElemSize};
            aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
                tmp_mk_base_buffer, faDataType, faElemSize,
                tmp_mk_base1_ne, tmp_mk_base1_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

            aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);

            aclTensor* tmp_mk_base2_tensor = nullptr;
            if (n_heads_log2_floor < ne2_ne3) {
                int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
                size_t tmp_mk_base2_nb[] = {faElemSize};
                tmp_mk_base2_tensor = ggml_cann_create_tensor(
                    (char*)tmp_mk_base_buffer +
                        n_heads_log2_floor * faElemSize,
                    faDataType, faElemSize,
                    tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
            }

            // init mk
            int64_t tmp_mk_base_ne[] = {ne2_ne3};
            size_t tmp_mk_base_nb[] = {faElemSize};
            aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
                tmp_mk_base_buffer, faDataType, faElemSize,
                tmp_mk_base_ne, tmp_mk_base_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
            aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
                tmp_arange_buffer, faDataType, faElemSize,
                tmp_mk_base_ne, tmp_mk_base_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
            aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);

            // reshape mk
            int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
            size_t tmp_mk_nb[GGML_MAX_DIMS];
            tmp_mk_nb[0] = faElemSize;
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
            }
            aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
                tmp_mk_base_buffer, faDataType, faElemSize,
                tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
                ACL_FORMAT_ND);
            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);

            ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
                                        tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
                                        tmp_arange_tensor, tmp_mk_tensor);
        }
    }
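
// ---- Illustrative sketch (not part of the diff) ----------------------------
// What the arange/fill/pow sequence in the older variant above amounts to on
// the host: slopes are base^exponent, where base is m0 for the first
// n_heads_log2_floor heads and m1 afterwards, and the exponents come from the
// two aranges. Hypothetical helper; compare with compute_alibi_slopes()
// sketched earlier in this file.
#include <cmath>
#include <cstdint>
#include <vector>

static std::vector<float> slopes_via_pow(int64_t n, float max_bias) {
    const int64_t k = (int64_t) 1 << (int) floor(log2((double) n));
    const float m0 = powf(2.0f, -max_bias / k);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / k);
    std::vector<float> out((size_t) n);
    for (int64_t i = 0; i < n; i++) {
        float base = i < k ? m0 : m1;                                  // aclnn_fill_scalar
        float expo = i < k ? (float) (i + 1)                           // arange1: 1, 2, ...
                           : (float) (2 * (i - k) + 1);                // arange2: 1, 3, 5, ...
        out[(size_t) i] = powf(base, expo);                            // aclnn_pow_tensor_tensor
    }
    return out;
}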

@@ -424,25 +424,15 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @details This function retrieves rows from a source tensor src0 according to
 * the indices provided in another tensor src1 and stores the result in
 * a destination tensor (\p dst).
 * a destination tensor (\p dst). It supports different data types
 * including F32, F16, Q4_0, and Q8_0.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the extracted rows will be stored.
 * dst->op is `GGML_OP_GET_ROWS`.
 */
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Writes specific rows into a tensor at positions specified by indices.
 *
 * @details This function copies rows from a source tensor into a destination
 * tensor (\p dst) at the positions indicated by the indices in another
 * tensor.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the specified rows will be updated.
 */
void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Executes matrix multiplication for the given tensor.
 *
@@ -1108,7 +1098,7 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
 */
template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

    aclTensor* acl_src = ggml_cann_create_tensor(src);
@@ -1119,125 +1109,49 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
}

/**
 * @brief Applies a unary operation to a ggml tensor using the CANN backend.
 *
 * @details This function applies a unary operation to the input tensor using
 * a user-provided lambda or callable `unary_op`. The lambda receives the
 * CANN backend context and two ACL tensors: the source and the destination.
 *
 * Internally, this function handles the conversion from GGML tensors to ACL tensors,
 * calls the provided unary op, and manages resource cleanup. The input is assumed
 * to be `dst->src[0]`, and the result is written to `dst`.
 *
 * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
 *
 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
 * @param ctx The CANN context for operation execution.
 * @param dst The destination ggml_tensor where the result will be stored.
 * The input tensor is assumed to be `dst->src[0]`.
 *
 * @see GGML_CANN_CALL_OP_UNARY
 */
void ggml_cann_op_unary(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Applies a unary operation to a ggml tensor using the CANN backend.
 *
 * @details This function performs a unary operation on the input tensor using
 * a user-provided lambda or callable object `unary_op`, which accepts the CANN
 * context and two ACL tensors (source and destination). Internally, this function
 * creates ACL representations of the ggml tensors and invokes the unary operation.
 * The result is stored in the destination tensor `dst`. This utility abstracts the
 * common boilerplate of tensor conversion and cleanup when implementing unary ops.
 *
 * @param unary_op A callable that performs the unary operation using CANN APIs.
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the result will be stored.
 * The source tensor is retrieved from `dst->src[0]`.
 */
void ggml_cann_unary_op(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
 *
 * @details This function performs a gated activation such as GEGLU or ReGLU.
 * It supports two input modes:
 *
 * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
 *    These are used directly as the value and gate tensors.
 *
 * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
 *    contain a concatenation of value and gate along the first dimension. This tensor
 *    will be split into two equal halves to form the value and gate inputs.
 *
 * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
 * then multiplies the result in-place with the gate tensor:
 *
 * @code
 * dst = unary_op(value) * gate;
 * @endcode
 *
 * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
 * order of value/gate in the packed input case.
 *
 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
 * It receives (ctx, acl_value_tensor, acl_output_tensor).
 * @param ctx The CANN context used for execution.
 * @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
 *
 * @see GGML_CANN_CALL_OP_UNARY_GATED
 */
void ggml_cann_op_unary_gated(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
 *
 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
 * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
 * unary ops in the CANN backend.
 *
 * Internally, this macro expands to a lambda like:
 * @code
 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
 *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
 * };
 * @endcode
 *
 * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
 *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
 * @see ggml_cann_op_unary
 * @see GGML_CANN_CALL_ACLNN_OP
 */
#define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context& ctx, \
                         aclTensor* acl_src, \
                         aclTensor* acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_op_unary(lambda, ctx, dst); \
    } \
    while (0)

/**
 * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
 *
 * This macro defines an inline lambda wrapping a specific ACL operation name,
 * and passes it to the templated ggml_cann_unary_op function. It simplifies
 * calling unary ops by hiding the lambda boilerplate.
 *
 * Internally, the lambda will call:
 * @code
 * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
 * @endcode
 *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
 * @see ggml_cann_unary_op
 * @see GGML_CANN_CALL_ACLNN_OP
 */
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context& ctx, \
                         aclTensor* acl_src, \
                         aclTensor* acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_unary_op(lambda, ctx, dst); \
    } \
    while (0)

/**
 * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
 *
 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
 * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
 * executing gated unary ops in the CANN backend.
 *
 * Internally, this macro expands to a lambda like:
 * @code
 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
 *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
 * };
 * @endcode
 *
 * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
 *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
 * @see ggml_cann_op_unary_gated
 * @see GGML_CANN_CALL_ACLNN_OP
 */
#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context& ctx, \
                         aclTensor* acl_src, \
                         aclTensor* acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_op_unary_gated(lambda, ctx, dst); \
    } \
    while (0)
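
// ---- Illustrative sketch (not part of the diff) ----------------------------
// What one use of the helper macro expands to, written out by hand. OP_NAME is
// spliced into the GGML_CANN_CALL_ACLNN_OP invocation, and `dst` must be in
// scope at the expansion site, as it is in ggml_cann_compute_forward below.
// Hypothetical expansion for GGML_CANN_CALL_OP_UNARY(Abs):
//
//   do {
//       auto lambda = [](ggml_backend_cann_context& ctx,
//                        aclTensor* acl_src,
//                        aclTensor* acl_dst) {
//           GGML_CANN_CALL_ACLNN_OP(ctx, Abs, acl_src, acl_dst);
//       };
//       ggml_cann_op_unary(lambda, ctx, dst);
//   } while (0);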

#endif // CANN_ACLNN_OPS

@@ -337,29 +337,6 @@ private:
    int32_t device_;
};

#ifdef USE_ACL_GRAPH
struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cann_graph {
    ~ggml_cann_graph() {
        if (graph != nullptr) {
            aclmdlRIDestroy(graph);
        }
    }

    aclmdlRI graph = nullptr;

    std::vector<ggml_graph_node_properties> ggml_graph_properties;
};
#endif // USE_ACL_GRAPH

/**
 * @brief Context for managing CANN backend operations.
 */
@@ -368,13 +345,8 @@ struct ggml_backend_cann_context {
    std::string name;                /**< Name of the device. */
    std::string description;         /**< Description of the device. */
    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
#ifdef USE_ACL_GRAPH
    /// Cached CANN ACL graph used for executing the current ggml computation graph.
    std::unique_ptr<ggml_cann_graph> cann_graph;
#endif
    cann_task_queue task_queue;
    bool async_mode;
    bool support_set_rows;

    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */

@@ -390,14 +362,6 @@ struct ggml_backend_cann_context {
        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                      device, async_mode ? "ON" : "OFF");

        support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
        GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");

        if (!support_set_rows) {
            GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
                          "Falling back to eager mode.\n", __func__);
        }
    }
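
// ---- Illustrative sketch (not part of the diff) ----------------------------
// A minimal stand-in for the parse_bool/get_env gate used above, assuming the
// usual truthy spellings. The real helpers live elsewhere in the backend; this
// only shows the intended behavior.
#include <cstdlib>
#include <string>

static bool env_flag(const char* name) {
    const char* v = std::getenv(name);
    if (v == nullptr) {
        return false;
    }
    std::string s(v);
    return s == "1" || s == "on" || s == "yes" || s == "true";
}
// usage: bool async_mode = env_flag("GGML_CANN_ASYNC_MODE");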

/**

@@ -1116,59 +1116,61 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
    return GGML_STATUS_SUCCESS;
}

// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
namespace {
    void* g_nz_workspace = nullptr;
    size_t g_nz_workspace_allocated = 0;

    void release_nz_workspace() {
        if (g_nz_workspace) {
            aclrtFree(g_nz_workspace);
            g_nz_workspace = nullptr;
            g_nz_workspace_allocated = 0;
        }
    }

    void relloc_nz_workspace(size_t new_size) {
        if (new_size > g_nz_workspace_allocated) {
            if (g_nz_workspace) {
                aclrtFree(g_nz_workspace);
                g_nz_workspace = nullptr;
            }
            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
            g_nz_workspace_allocated = new_size;
        }
    }
}  // namespace

static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
                                 aclDataType dataType, aclTensor **tensor)
{
    uint64_t size = 1;
    for (auto i : shape) {
        size *= i;
    }

    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));

    size *= sizeof(int16_t);

    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);

    std::vector<int64_t> strides(shape.size(), 1);
    for (int64_t i = shape.size() - 2; i >= 0; i--) {
        strides[i] = shape[i + 1] * strides[i + 1];
    }

    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
                              shape.data(), shape.size(), *deviceAddr);
    return 0;
}

/**
 * @brief Convert tensor weights to NZ format using Ascend CANN API.
 *
 * This function creates a transposed tensor descriptor and performs the
 * TransMatmulWeight operation. Converting tensor formats can significantly
 * improve performance on certain hardware.
 *
 * @param tensor Pointer to the input ggml_tensor containing the weights.
 * @param data Pointer to the raw data buffer for the tensor weights.
 * @param offset Byte offset within the tensor data buffer where weights start.
 *
 * @note The workspace buffer used in this function is managed globally and reused
 * across calls. This reduces overhead from repeated memory allocation and deallocation.
 */
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
                                                          tensor->nb, 2, ACL_FORMAT_ND, offset);

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;

    // TransMatmulWeight
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
                                                     &workspaceSize, &executor));
    // Avoid frequent malloc/free of the workspace.
    relloc_nz_workspace(workspaceSize);

    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));

    ACL_CHECK(aclDestroyTensor(weightTransposed));
}

static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
    aclrtStream stream;
    ACL_CHECK(aclrtCreateStream(&stream));

    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
    void *weightTransposedDeviceAddr = nullptr;
    aclTensor *weightTransposed = nullptr;
    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
                          ggml_cann_type_mapping(tensor->type), &weightTransposed);

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;
    void *workspaceAddr = nullptr;

    // TransMatmulWeight
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
    if (workspaceSize > 0) {
        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
        workspaceAddrPtrTrans.reset(workspaceAddr);
    }
    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));

    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);

    aclrtMemcpy((char *)tensor->data + offset, size,
                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
    ACL_CHECK(aclDestroyTensor(weightTransposed));
    aclrtFree(weightTransposedDeviceAddr);
}
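
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Caller-side flow for the ND->NZ conversion above, reduced to pseudocode in
// C++. All names here are hypothetical stand-ins for the real backend helpers.
#include <cstddef>

struct tensor_stub {};  // stands in for ggml_tensor

static bool is_eligible_matmul_weight(const tensor_stub*) { return true; }  // assumed predicate
static void copy_host_to_device(tensor_stub*, const void*, size_t) {}       // assumed memcpy wrapper
static void convert_weight_to_nz(tensor_stub*, const void*, size_t) {}      // stands in for weight_format_to_nz

static void set_tensor_sketch(tensor_stub* t, const void* host, size_t offset,
                              size_t size, bool weight_to_nz) {
    copy_host_to_device(t, host, size);
    if (weight_to_nz && is_eligible_matmul_weight(t)) {
        // The cached NZ workspace grows monotonically, so repeated
        // conversions avoid per-tensor malloc/free churn.
        convert_weight_to_nz(t, host, offset);
    }
}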

// TODO: need to handle tensors which have paddings.
@@ -1195,14 +1197,14 @@ static void ggml_backend_cann_buffer_set_tensor(
    // For acl, synchronous functions use this default stream.
    // Why aclrtSynchronizeDevice?

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
    bool weightToNZ = false;
#ifdef ASCEND_310P
    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
    if (!need_transform(tensor->type)) {
        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                              ACL_MEMCPY_HOST_TO_DEVICE));
        if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
            GGML_ASSERT(tensor->ne[2] == 1);
            GGML_ASSERT(tensor->ne[3] == 1);
        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
            weight_format_to_nz(tensor, data, offset);
        }
    } else {
@@ -1438,32 +1440,20 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));

    // The last line must be bigger than 32 bytes, because every single op
    // deals with at least 32 bytes.
    // TODO: quantized type?
    // int64_t line_size = ne0 * ggml_element_size(tensor);
    // int64_t line_size_align_32 = (line_size + 31) & ~31;
    // size += (line_size_align_32 - line_size);

    // TODO: quantized types are not supported yet.
    // TODO: consider non-contiguous tensors.
    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(
                tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
    } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
        // NZ-format weights do not support quantized types yet.
        // If an ND tensor is transformed to NZ, its size may change.
        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
        GGML_ASSERT(tensor->ne[2] == 1);
        GGML_ASSERT(tensor->ne[3] == 1);
        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
        size_t new_size;
        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
                  ggml_cann_type_mapping(tensor->type), &new_size));
        ACL_CHECK(aclDestroyIntArray(acl_shape));
        size = std::max(size, new_size);
    }

    return size;

@@ -1669,9 +1659,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
        case GGML_OP_GET_ROWS:
            ggml_cann_get_rows(ctx, dst);
            break;
        case GGML_OP_SET_ROWS:
            ggml_cann_set_rows(ctx, dst);
            break;
        case GGML_OP_DUP:
            ggml_cann_dup(ctx, dst);
            break;
@@ -1694,18 +1681,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
                    GGML_CANN_CALL_OP_UNARY(Abs);
                    GGML_CANN_CALL_UNARY_OP(Abs);
                    break;
                case GGML_UNARY_OP_NEG:
                    GGML_CANN_CALL_OP_UNARY(Neg);
                    GGML_CANN_CALL_UNARY_OP(Neg);
                    break;
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_ERF:
                    // aclnnGelu internally uses the erf-based approximation.
                    GGML_CANN_CALL_OP_UNARY(Gelu);
                    GGML_CANN_CALL_UNARY_OP(Gelu);
                    break;
                case GGML_UNARY_OP_SILU:
                    GGML_CANN_CALL_OP_UNARY(Silu);
                    GGML_CANN_CALL_UNARY_OP(Silu);
                    break;
                case GGML_UNARY_OP_GELU_QUICK: {
                    auto lambda = [](ggml_backend_cann_context& ctx,
@@ -1713,31 +1698,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                                     aclTensor* acl_dst) {
                        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
                    };
                    ggml_cann_op_unary(lambda, ctx, dst);
                    ggml_cann_unary_op(lambda, ctx, dst);
                } break;
                case GGML_UNARY_OP_TANH:
                    GGML_CANN_CALL_OP_UNARY(Tanh);
                    GGML_CANN_CALL_UNARY_OP(Tanh);
                    break;
                case GGML_UNARY_OP_RELU:
                    GGML_CANN_CALL_OP_UNARY(Relu);
                    GGML_CANN_CALL_UNARY_OP(Relu);
                    break;
                case GGML_UNARY_OP_SIGMOID:
                    GGML_CANN_CALL_OP_UNARY(Sigmoid);
                    GGML_CANN_CALL_UNARY_OP(Sigmoid);
                    break;
                case GGML_UNARY_OP_HARDSIGMOID:
                    GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
                    GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
                    break;
                case GGML_UNARY_OP_HARDSWISH:
                    GGML_CANN_CALL_OP_UNARY(Hardswish);
                    GGML_CANN_CALL_UNARY_OP(Hardswish);
                    break;
                case GGML_UNARY_OP_EXP:
                    GGML_CANN_CALL_OP_UNARY(Exp);
                    GGML_CANN_CALL_UNARY_OP(Exp);
                    break;
                case GGML_UNARY_OP_ELU:
                    ggml_cann_elu(ctx, dst);
                    break;
                case GGML_UNARY_OP_SGN:
                    GGML_CANN_CALL_OP_UNARY(Sign);
                    GGML_CANN_CALL_UNARY_OP(Sign);
                    break;
                case GGML_UNARY_OP_STEP:
                    ggml_cann_step(ctx, dst);
@@ -1746,31 +1731,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                    return false;
            }
            break;
        case GGML_OP_GLU:
            switch (ggml_get_glu_op(dst)) {
                case GGML_GLU_OP_REGLU:
                    GGML_CANN_CALL_OP_UNARY_GATED(Relu);
                    break;
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_GEGLU_ERF:
                    // aclnnGelu internally uses the erf-based approximation.
                    GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
                    break;
                case GGML_GLU_OP_SWIGLU:
                    GGML_CANN_CALL_OP_UNARY_GATED(Silu);
                    break;
                case GGML_GLU_OP_GEGLU_QUICK: {
                    auto lambda = [](ggml_backend_cann_context& ctx,
                                     aclTensor* acl_src,
                                     aclTensor* acl_dst) {
                        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
                    };
                    ggml_cann_op_unary_gated(lambda, ctx, dst);
                } break;
                default:
                    return false;
            }
            break;
        case GGML_OP_NORM:
            ggml_cann_norm(ctx, dst);
            break;
@@ -1813,7 +1773,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_binary_op<aclnn_mul>(ctx, dst);
            break;
        case GGML_OP_SQRT:
            GGML_CANN_CALL_OP_UNARY(Sqrt);
            GGML_CANN_CALL_UNARY_OP(Sqrt);
            break;
        case GGML_OP_CLAMP:
            ggml_cann_clamp(ctx, dst);
@@ -1858,16 +1818,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_argmax(ctx, dst);
            break;
        case GGML_OP_COS:
            ggml_cann_op_unary<aclnn_cos>(ctx, dst);
            ggml_cann_unary_op<aclnn_cos>(ctx, dst);
            break;
        case GGML_OP_SIN:
            ggml_cann_op_unary<aclnn_sin>(ctx, dst);
            ggml_cann_unary_op<aclnn_sin>(ctx, dst);
            break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cann_conv_transpose_1d(ctx, dst);
            break;
        case GGML_OP_LOG:
            GGML_CANN_CALL_OP_UNARY(Log);
            GGML_CANN_CALL_UNARY_OP(Log);
            break;
        case GGML_OP_MEAN:
            ggml_cann_mean(ctx, dst);

@@ -2016,9 +1976,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
        (ggml_backend_cann_context*)backend_dst->context;

    size_t copy_size = ggml_nbytes(dst);
    if (copy_size == 0) {
        return true;
    }
    if (backend_src != backend_dst) {
        ggml_backend_cann_buffer_context* buf_ctx_src =
            (ggml_backend_cann_buffer_context*)buf_src->context;
@@ -2075,160 +2032,6 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
    ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
}

#ifdef USE_ACL_GRAPH
/**
 * @brief Populate the internal CANN graph node properties from the ggml computation graph.
 *
 * This function copies all node attributes (operation type, dimensions, strides, input sources,
 * and operation parameters) into the cached CANN graph structure for later reuse or comparison.
 *
 * @param cann_ctx The CANN backend context.
 * @param cgraph The ggml computational graph.
 */
static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
    for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
        ggml_tensor * node = cgraph->nodes[node_idx];
        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;

        for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
            cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
            cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
        }
        for (int src = 0; src < GGML_MAX_SRC; src++) {
            cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
                node->src[src] ? node->src[src]->data : nullptr;
        }
        memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
    }
}

/**
 * @brief Check if a ggml tensor node matches a previously captured CANN graph node.
 *
 * This function compares all relevant fields (address, op type, shape, source inputs, op params)
 * to determine whether the current node matches a previously recorded version.
 *
 * @param node The current ggml tensor node.
 * @param graph_node_properties The stored properties of a CANN graph node.
 * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
 */
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
    if (node->data != graph_node_properties->node_address &&
        node->op != GGML_OP_VIEW) {
        return false;
    }
    if (node->op != graph_node_properties->node_op) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (node->ne[i] != graph_node_properties->ne[i]) {
            return false;
        }
        if (node->nb[i] != graph_node_properties->nb[i]) {
            return false;
        }
    }
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (node->src[i] &&
            node->src[i]->data != graph_node_properties->src_address[i] &&
            node->op != GGML_OP_VIEW
        ) {
            return false;
        }
    }
    if (node->op == GGML_OP_SCALE &&
        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
        return false;
    }
    return true;
}

/**
 * @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
 *
 * This checks whether the number or properties of ggml graph nodes have changed
 * compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
 *
 * @param cann_ctx The CANN backend context.
 * @param cgraph The current ggml computation graph.
 * @return true if an update is required; false otherwise.
 */
static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
    // The number of nodes is different, so the graph needs to be reconstructed.
    if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
        cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
        return true;
    }

    // The number of nodes is the same; iterate over each node to check whether they match.
    for (int i = 0; i < cgraph->n_nodes; i++) {
        bool has_matching_properties = ggml_graph_node_has_matching_properties(
            cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
        if (!has_matching_properties) {
            return true;
        }
    }
    return false;
}
#endif // USE_ACL_GRAPH

/**
 * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
 *
 * If CANN graph execution is enabled and graph capture is required, this function begins
 * graph capture, runs the graph, ends capture, and stores the captured graph.
 *
 * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
 *
 * @param cann_ctx The CANN backend context.
 * @param cgraph The ggml computation graph.
 * @param use_cann_graph Whether to use CANN graph execution.
 * @param cann_graph_update_required Whether graph capture is needed due to graph changes.
 */
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
                                            bool & use_cann_graph, bool & cann_graph_update_required) {
#ifdef USE_ACL_GRAPH
    if (use_cann_graph && cann_graph_update_required) {
        if (cann_ctx->cann_graph->graph != nullptr) {
            ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
            cann_ctx->cann_graph->graph = nullptr;
        }
        ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
    }
#endif // USE_ACL_GRAPH

    // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
    // With the use of CANN graphs, the execution will be performed by the graph launch.
    if (!use_cann_graph || cann_graph_update_required) {
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_tensor * node = cgraph->nodes[i];

            if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
                continue;
            }

            bool ok = ggml_cann_compute_forward(*cann_ctx, node);
            if (!ok) {
                GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
            }
            GGML_ASSERT(ok);
        }
    }

#ifdef USE_ACL_GRAPH
    if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
        ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
    }

    if (use_cann_graph) {
        // Execute graph
        ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
    }
#endif // USE_ACL_GRAPH
}
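
// ---- Illustrative sketch (not part of the diff) ----------------------------
// The capture/replay pattern the block above implements, reduced to its
// control flow. Names are stand-ins; the real calls are the aclmdlRICapture*
// and aclmdlRIExecuteAsync APIs used above.
#include <functional>

struct graph_cache_stub {
    bool captured = false;
    void begin_capture() { /* aclmdlRICaptureBegin */ }
    void end_capture()   { captured = true; /* aclmdlRICaptureEnd */ }
    void launch()        { /* aclmdlRIExecuteAsync */ }
};

static void run_graph_sketch(graph_cache_stub& g, bool topology_changed,
                             const std::function<void()>& eager_run) {
    if (!g.captured || topology_changed) {
        g.begin_capture();  // ops issued during eager_run are recorded
        eager_run();
        g.end_capture();
    }
    g.launch();             // replay the captured graph on the stream
}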


/**
 * @brief Computes a computational graph using a CANN backend.
 *
@@ -2245,37 +2048,24 @@ static enum ggml_status ggml_backend_cann_graph_compute(
    ggml_backend_t backend, ggml_cgraph* cgraph) {
    ggml_backend_cann_context* cann_ctx =
        (ggml_backend_cann_context*)backend->context;

    ggml_cann_set_device(cann_ctx->device);
    release_nz_workspace();
#ifdef USE_ACL_GRAPH
    bool use_cann_graph = true;
    bool cann_graph_update_required = false;

    // check environment LLAMA_SET_ROWS
    if (!cann_ctx->support_set_rows) {
        use_cann_graph = false;
    }

    if (use_cann_graph) {
        if (cann_ctx->cann_graph == nullptr) {
            cann_ctx->cann_graph.reset(new ggml_cann_graph());
            cann_graph_update_required = true;
        }

        cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
        set_ggml_graph_node_properties(cann_ctx, cgraph);
    }
#else
    bool use_cann_graph = false;
    bool cann_graph_update_required = false;
#endif // USE_ACL_GRAPH

    evaluate_and_capture_cann_graph(
        cann_ctx,
        cgraph,
        use_cann_graph,
        cann_graph_update_required
    );

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor* node = cgraph->nodes[i];

        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
            continue;
        }

        bool ok = ggml_cann_compute_forward(*cann_ctx, node);

        if (!ok) {
            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
                           node->name, ggml_op_name(node->op));
        }
        GGML_ASSERT(ok);
    }

    return GGML_STATUS_SUCCESS;
}

@@ -2311,23 +2101,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            case GGML_UNARY_OP_ELU:
            case GGML_UNARY_OP_SGN:
            case GGML_UNARY_OP_STEP:
            case GGML_UNARY_OP_GELU_ERF:
                return true;
            default:
                return false;
        }
        case GGML_OP_GLU:
            switch (ggml_get_glu_op(op)) {
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    return true;
                default:
                    return false;
            }
            break;
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
                case GGML_TYPE_F16:
@@ -2374,15 +2151,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                    return false;
            }
        } break;
        case GGML_OP_SET_ROWS: {
            switch (op->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                    return true;
                default:
                    return false;
            }
        } break;
        case GGML_OP_SET_ROWS:
            {
                // TODO: add support
                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                return false;
            } break;
        case GGML_OP_CPY: {
            ggml_tensor *src = op->src[0];
            if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@@ -2391,6 +2166,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // only F32 and F16 are supported.
                return false;
            }

            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
                // a non-contiguous dst is not supported.
                return false;
            }

            return true;
        } break;
        case GGML_OP_CONT: {
@@ -2456,8 +2237,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            // value of paddingW should be at most half of kernelW
            return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
        }
        case GGML_OP_DUP:
        case GGML_OP_SUM:
        case GGML_OP_DUP:
        case GGML_OP_IM2COL:
        case GGML_OP_CONCAT:
        case GGML_OP_REPEAT:
@@ -2499,11 +2280,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
            return bias == 0.0f;  // TODO: support bias != 0.0f
        case GGML_OP_SOFT_MAX:
            // TODO: support attention sinks [TAG_ATTN_SINKS]
            if (op->src[2]) {
                return false;
            }
            return true;
            // TODO: support broadcast
            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
        case GGML_OP_FLASH_ATTN_EXT: {
            // derived from [ggml-cuda.cu]
            if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
@@ -2515,10 +2294,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
                return false;
            }
            // TODO: support attention sinks [TAG_ATTN_SINKS]
            if (op->src[4]) {
                return false;
            }
            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                // different head sizes of K and V are not supported yet
                return false;
@@ -2530,6 +2305,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // DeepSeek MLA
                return false;
            }
            // TODO: support broadcast
            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
            if (op->src[0]->ne[3] != 1) {
                return false;
            }
            float logitSoftcap = 0.0f;
            memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
            if (logitSoftcap != 0.0f) {
|
||||
|
||||
@@ -99,9 +99,6 @@ typedef sycl::half2 ggml_half2;
#define QI4_1 (QK4_1 / (4 * QR4_1))
#define QR4_1 2

#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
#define QR_MXFP4 2

#define QI5_0 (QK5_0 / (4 * QR5_0))
#define QR5_0 2

@@ -187,13 +184,6 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");

#define QK_MXFP4 32
typedef struct {
    uint8_t e; // E8M0
    uint8_t qs[QK_MXFP4/2];
} block_mxfp4;
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");

#define QK5_0 32
typedef struct {
    ggml_half d;           // delta
@@ -1084,17 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
GGML_TABLE_END()

// TODO: fix name to kvalues_iq4_nl
GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
GGML_TABLE_END()

// e2m1 values (doubled)
// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
GGML_TABLE_END()

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
#define IQ1M_DELTA 0.125f
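Note: the block_mxfp4 layout removed above stores one shared E8M0 exponent plus 16 bytes of packed 4-bit indices into the kvalues_mxfp4 table. The following is a minimal standalone sketch of how such a block decodes to 32 floats; it is not the ggml implementation (the helper name dequant_mxfp4_block is hypothetical, and ldexpf stands in for ggml's GGML_E8M0_TO_FP32_HALF macro):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Doubled e2m1 values, mirroring the kvalues_mxfp4 table above.
static const int8_t kvalues[16] = { 0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12 };

// Hypothetical helper: dequantize one 32-element mxfp4 block. e is the shared
// E8M0 exponent; qs packs two 4-bit indices per byte (low nibbles first).
static void dequant_mxfp4_block(uint8_t e, const uint8_t qs[16], float out[32]) {
    // E8M0 is a pure power of two with bias 127; the extra -1 compensates
    // for the table storing doubled e2m1 values.
    const float d = ldexpf(1.0f, (int) e - 127 - 1);
    for (int j = 0; j < 16; ++j) {
        out[j]      = d * kvalues[qs[j] & 0x0f];
        out[j + 16] = d * kvalues[qs[j] >> 4];
    }
}

int main(void) {
    uint8_t qs[16] = {0};
    float out[32];
    qs[0] = 0x17;                            // low nibble 7 (-> 12), high nibble 1 (-> 1)
    dequant_mxfp4_block(127 + 1, qs, out);   // exponent = bias + 1, i.e. scale 1.0
    printf("%.1f %.1f\n", out[0], out[16]);  // prints 12.0 1.0
    return 0;
}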
@@ -458,9 +458,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        list(APPEND ARCH_FLAGS -march=z16)
    elseif (${S390X_M} MATCHES "9175|9176")
        # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
        # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
        message(STATUS "z17 target")
        list(APPEND ARCH_FLAGS -march=arch15)
        list(APPEND ARCH_FLAGS -march=z17)
    else()
        message(STATUS "Unknown target")
        message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
@@ -13,7 +13,6 @@
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -38,25 +37,17 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -73,7 +64,6 @@
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -82,23 +72,18 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__loongarch64)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -107,16 +92,12 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@@ -131,7 +112,6 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -139,15 +119,11 @@
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@@ -163,7 +139,6 @@
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -172,16 +147,12 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__wasm__)
// quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@@ -196,7 +167,6 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -205,14 +175,10 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#endif

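Note: these arch-fallback definitions alias the generic (scalar) symbol names to the public kernel names on targets without a hand-tuned implementation. The following is a minimal standalone sketch of that aliasing pattern; my_dot and my_dot_generic are hypothetical names, not ggml symbols:

#include <stdio.h>

/* On a target with no hand-written kernel, the generic symbol name is
 * remapped to the public one, so the scalar definition below is compiled
 * directly as my_dot(). */
#ifndef HAVE_FAST_DOT
#define my_dot_generic my_dot
#endif

void my_dot_generic(const float * x, const float * y, int n, float * s) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i] * y[i];  // plain scalar loop
    *s = sum;
}

int main(void) {
    float a[2] = {1.0f, 2.0f}, b[2] = {3.0f, 4.0f}, s;
    my_dot(a, b, 2, &s);   // resolves to the generic scalar version here
    printf("%.1f\n", s);   // prints 11.0
    return 0;
}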
@@ -589,67 +589,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;
}

void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK_MXFP4;

    int ib = 0;
    float sumf = 0;

#if defined __ARM_NEON
    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    uint8x16x2_t q4bits;
    int8x16x4_t q4b;
    int8x16x4_t q8b;
    int32x4_t prod_1;
    int32x4_t prod_2;

    for (; ib + 1 < nb; ib += 2) {
        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
        q8b.val[0] = vld1q_s8(y[ib + 0].qs);
        q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
        q8b.val[2] = vld1q_s8(y[ib + 1].qs);
        q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);

        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

        sumf +=
            GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
            GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
    }

#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
        int sumi1 = 0;
        int sumi2 = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
            sumi1 += y[ib].qs[j + 0]          * kvalues_mxfp4[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -1297,10 +1236,44 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};

    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int sum = 0;

        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 32; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
                }
            }
        }
        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 16; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
                }
            }
        }

        for (size_t l = 0; l < 4; ++l) {
            for (size_t j = 0; j < sizeof(x->qh); ++j) {
                uint8_t q = x[i].qh[j] * pow3[l];
                uint16_t xi = ((uint16_t) q * 3) >> 8;
                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
            }
        }

        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
    }

    *s = sumf;
#endif
}

@@ -1408,10 +1381,25 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int32_t sumi = 0;

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            for (size_t l = 0; l < 4; ++l) {
                for (size_t k = 0; k < 32; ++k) {
                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
                }
            }
        }

        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

        sumf += (float) sumi * d;
    }

    *s = sumf;
#endif
}
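Note: the tq1_0 scalar path above extracts base-3 digits with a fixed-point trick: a byte q encodes five trits scaled by 256/243, so multiplying by pow3[l] (mod 256) shifts digit l to the top, and ((uint16_t)q * 3) >> 8 reads it out. A minimal standalone check of that identity (the packing here mirrors the idea, not ggml's quantizer verbatim):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
    const int trits[5] = {2, 0, 1, 2, 1};            // digits to pack, each in {0,1,2}

    // Pack: v = t0*81 + t1*27 + t2*9 + t3*3 + t4, then scale by 256/243 (ceil).
    uint16_t v = 0;
    for (int l = 0; l < 5; ++l) v = v * 3 + trits[l];
    uint8_t q = (uint8_t) ((v * 256 + 242) / 243);

    // Unpack: q * 3^l wraps mod 256, moving digit l into the top of the
    // fixed-point fraction; * 3 >> 8 then recovers it exactly.
    for (int l = 0; l < 5; ++l) {
        uint8_t  shifted = (uint8_t) (q * pow3[l]);
        uint16_t digit   = ((uint16_t) shifted * 3) >> 8;
        printf("digit %d = %u (expected %d)\n", l, digit, trits[l]);
    }
    return 0;
}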
@@ -1741,10 +1729,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const  int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}
@@ -2034,12 +2057,68 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}
@@ -2352,14 +2431,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
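Note: the utmp/kmask shuffle above unpacks the 12-byte K-quant scale block into eight 6-bit scales and eight 6-bit mins. The same layout can be read byte-wise; the sketch below packs and unpacks it for verification. get_scale_min_k4 mirrors the helper used elsewhere in ggml's k-quants, while pack_k4 and the test values are our own illustration:

#include <stdint.h>
#include <stdio.h>

// Pack 8 six-bit scales and 8 six-bit mins into 12 bytes (K-quant layout).
static void pack_k4(const uint8_t sc[8], const uint8_t mn[8], uint8_t q[12]) {
    for (int j = 0; j < 4; ++j) {
        q[j]     = sc[j] & 63;
        q[j + 4] = mn[j] & 63;
        q[j + 8] = (sc[j + 4] & 0xF) | ((mn[j + 4] & 0xF) << 4);
    }
    for (int j = 0; j < 4; ++j) {
        q[j]     |= (uint8_t)((sc[j + 4] >> 4) << 6);   // top 2 bits of scales 4..7
        q[j + 4] |= (uint8_t)((mn[j + 4] >> 4) << 6);   // top 2 bits of mins 4..7
    }
}

// Read back scale j and min j the way the scalar kernels do.
static void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j] & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *m = (q[j + 4] >>  4) | ((q[j - 0] >> 6) << 4);
    }
}

int main(void) {
    const uint8_t sc[8] = {1, 12, 23, 34, 45, 56, 61, 7};
    const uint8_t mn[8] = {9, 8, 7, 6, 5, 4, 3, 2};
    uint8_t q[12] = {0};
    pack_k4(sc, mn, q);
    for (int j = 0; j < 8; ++j) {
        uint8_t d, m;
        get_scale_min_k4(j, q, &d, &m);
        printf("j=%d: scale %u (exp %u), min %u (exp %u)\n", j, d, sc[j], m, mn[j]);
    }
    return 0;
}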
@@ -2452,14 +2578,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -2915,10 +3093,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }
    *s = sum;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
@@ -3014,10 +3229,34 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.25f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32[2];
    const uint8_t * aux8 = (const uint8_t *)aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const  int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(aux32, q2, 2*sizeof(uint32_t));
            q2 += 4;
            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}
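Note: in the iq2 scalar paths, each group of eight codebook values carries a compact sign pattern: ksigns_iq2xs maps a 7-bit index to an 8-bit mask (the eighth sign is derivable from parity), and kmask_iq2xs selects one bit per lane. A minimal standalone illustration of applying such a mask; the grid row and mask value below are made up:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint8_t kmask[8] = {1, 2, 4, 8, 16, 32, 64, 128};
    const uint8_t grid[8] = {25, 43, 11, 8, 19, 5, 43, 25};  // made-up grid row
    const uint8_t signs   = 0xA5;                            // bit j set -> negate value j
    for (int j = 0; j < 8; ++j) {
        int v = grid[j] * ((signs & kmask[j]) ? -1 : 1);
        printf("%d ", v);
    }
    printf("\n");
    return 0;
}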
@@ -3088,10 +3327,42 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = 0.125f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
        const  int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -3184,10 +3455,45 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = 0.125f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const  int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
            int ls2 = 1 + 2*(x[i].scales[ib32] >>  4);
            int sumi1 = 0, sumi2 = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += ls1 * sumi1 + ls2 * sumi2;
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    *s = 0.125f * sumf;

#endif

}

@@ -3247,10 +3553,36 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.5f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}

@@ -3357,10 +3689,48 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
#endif
}
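Note: iq3_s extends each 8-bit grid index with a ninth bit taken from qh: the even bits of qh (via << (8-2*l)) extend the first index of a pair, the odd bits (via << (7-2*l)) the second. A quick standalone check of that bit arithmetic; the qh value is arbitrary:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t qh = 0x0B;  // example: bit 2*l holds the 9th bit of index 2*l
    for (int l = 0; l < 4; ++l) {
        uint16_t hi = ((uint16_t) qh << (8 - 2 * l)) & 256;
        printf("l=%d: extra bit %d -> adds %u to the grid index\n",
               l, (qh >> (2 * l)) & 1, hi);
    }
    return 0;
}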
@@ -3423,10 +3793,36 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const  int8_t  * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi = 0, sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
            const int delta = qh[ib] & 0x8000 ? -1 : 1;
            int lsum = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi  += ls * lsum;
            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;

#endif
}

@@ -3516,11 +3912,52 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(scale);
    ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int sum1[2], sum2[2], delta[4];

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const  int8_t  * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint8_t  * qh = x[i].qh;
        const uint16_t * sc = (const uint16_t *)x[i].scales;

        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            delta[0] = qh[0] & 0x08 ? -1 : 1;
            delta[1] = qh[0] & 0x80 ? -1 : 1;
            delta[2] = qh[1] & 0x08 ? -1 : 1;
            delta[3] = qh[1] & 0x80 ? -1 : 1;
            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
                int lsum1 = 0, lsum2 = 0;
                for (int j = 0; j < 8; ++j) {
                    lsum1 += q8[j] * grid[j];
                    lsum2 += q8[j];
                }
                q8 += 8;
                sum1[l/2] += lsum1;
                sum2[l/2] += lsum2*delta[l];
            }

            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;

            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
            qs += 4;
            qh += 2;
        }

        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
    }

    *s = sumf;

#endif
}
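Note: both iq1 kernels fold the per-group ±delta shift through precomputed q8 sums instead of shifting every weight: sum_j q8[j]*(g[j] + delta) equals sum_j q8[j]*g[j] + delta * sum_j q8[j]. A tiny numeric check of that identity with made-up values:

#include <stdio.h>

int main(void) {
    const int   q8[8] = {3, -1, 4, 1, -5, 9, 2, -6};
    const int   g[8]  = {1, -1, 1, 1, -1, -1, 1, -1};  // stand-in grid values
    const float delta = 0.125f;                        // IQ1S_DELTA

    float direct = 0.0f; int lsum = 0, bsum = 0;
    for (int j = 0; j < 8; ++j) {
        direct += q8[j] * (g[j] + delta);
        lsum   += q8[j] * g[j];
        bsum   += q8[j];
    }
    printf("direct=%f  folded=%f\n", direct, lsum + delta * bsum);  // identical
    return 0;
}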
@@ -3641,10 +4078,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const  int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}

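Note: iq4_nl/iq4_xs map each 4-bit index through a nonlinear codebook rather than scaling it directly. A minimal standalone lookup using the kvalues_iq4nl table shown earlier in ggml-common.h; the packed byte is an arbitrary example:

#include <stdint.h>
#include <stdio.h>

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

int main(void) {
    const uint8_t packed = 0x9B;  // low nibble 0xB, high nibble 0x9
    printf("low  -> %d\n", kvalues_iq4nl[packed & 0xf]);  // index 11 -> 38
    printf("high -> %d\n", kvalues_iq4nl[packed >> 4]);   // index  9 -> 13
    return 0;
}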
@@ -86,9 +86,35 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
        }
    }
#else
    UNUSED(nb);
    UNUSED(y);
    ggml_quantize_mat_q8_0_4x4_generic(x, vy, k);
    // scalar
    const int blck_size_interleave = 4;
    float srcv[4][QK8_0];
    float id[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max

            for (int j = 0; j < QK8_0; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
                amax = MAX(amax, fabsf(srcv[row_iter][j]));
            }

            const float d = amax / ((1 << 7) - 1);
            id[row_iter] = d ? 1.0f / d : 0.0f;

            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
        }

        for (int j = 0; j < QK8_0 * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);

            float x0 = srcv[src_id][src_offset] * id[src_id];
            y[i].qs[j] = roundf(x0);
        }
    }
#endif
}

@@ -179,9 +205,35 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
    }

#else
    UNUSED(nb);
    UNUSED(y);
    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
    // scalar
    const int blck_size_interleave = 8;
    float srcv[4][QK8_0];
    float id[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max

            for (int j = 0; j < QK8_0; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
                amax = MAX(amax, fabsf(srcv[row_iter][j]));
            }

            const float d = amax / ((1 << 7) - 1);
            id[row_iter] = d ? 1.0f / d : 0.0f;

            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
        }

        for (int j = 0; j < QK8_0 * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);

            float x0 = srcv[src_id][src_offset] * id[src_id];
            y[i].qs[j] = roundf(x0);
        }
    }
#endif
}
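Note: the src_offset/src_id arithmetic above interleaves four source rows in chunks of blck_size_interleave values. A standalone trace of that mapping for the 4x4 variant, printing where the first few destination slots come from:

#include <stdio.h>

int main(void) {
    const int B  = 4;   // blck_size_interleave for the 4x4 variant
    const int QK = 32;  // QK8_0
    for (int j = 0; j < 4 * QK; ++j) {
        int src_offset = (j / (4 * B)) * B + (j % B);
        int src_id     = (j % (4 * B)) / B;
        if (j < 16) printf("dst %2d <- row %d, elem %2d\n", j, src_id, src_offset);
        // rows 0..3 contribute B consecutive elements each, round-robin
    }
    return 0;
}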
@@ -243,7 +295,29 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    ggml_gemv_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -309,7 +383,29 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    ggml_gemv_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -401,7 +497,31 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif // #if defined(__ARM_FEATURE_SVE)

#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[8];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -471,7 +591,31 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
    ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[4];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
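Note: the q4_0 scalar paths above sign-extend packed nibbles without a lookup: (int8_t)(b << 4) moves the low nibble into the sign position (the repacked blocks appear to store nibbles in two's-complement form), and the single >> 4 after the multiply rescales both nibble products at once. A quick standalone trace of that trick:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    for (int nib = 0; nib < 16; ++nib) {
        int v0 = (int8_t) (nib << 4);  // low nibble, scaled by 16 and sign-extended
        printf("nibble %2d -> %4d (=%3d after >>4)\n", nib, v0, v0 >> 4);
    }
    return 0;  // nibbles 8..15 come out as -8..-1, i.e. two's complement
}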
@@ -952,7 +1096,40 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
);
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
|
||||
ggml_gemm_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
{
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
@@ -1373,7 +1550,38 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
);
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
ggml_gemm_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}

void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1811,7 +2019,38 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)

#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1887,5 +2126,38 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
    ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[4][4];
        int sumi;

        for (int y = 0; y < nr / 4; y++) {
            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
            for (int x = 0; x < nc / ncols_interleaved; x++) {
                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
                for (int m = 0; m < 4; m++) {
                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
                }
                for (int l = 0; l < nb; l++) {
                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                        for (int m = 0; m < 4; m++) {
                            for (int j = 0; j < ncols_interleaved; j++) {
                                sumi = 0;
                                for (int i = 0; i < blocklen; ++i) {
                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
                                }
                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                            }
                        }
                    }
                }
                for (int m = 0; m < 4; m++) {
                    for (int j = 0; j < ncols_interleaved; j++)
                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
                }
            }
        }
    }
}
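// IQ4_NL stores the same two 4-bit indices per byte, but maps them through a
// non-linear 16-entry codebook instead of the linear (q - 8) grid, so the
// kernel above looks values up and needs no final >> 4. A sketch of the
// per-byte decode (table values copied from ggml-common.h for illustration):
static const int8_t example_kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};
static inline void example_iq4nl_decode(uint8_t packed, int8_t * lo, int8_t * hi) {
    *lo = example_kvalues_iq4nl[packed & 0x0F]; // low nibble indexes the codebook
    *hi = example_kvalues_iq4nl[packed >> 4];   // high nibble likewise
}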

@@ -821,15 +821,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc) + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}
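// Why the q4_1 tail loop adds the extra m*s term: with x_j = d_x*q_j + m_x and
// y_j = d_y*p_j, the block dot product decomposes as
//   sum_j x_j*y_j = d_x*d_y * sum_j q_j*p_j  +  m_x * (d_y * sum_j p_j)
// and q8_1 blocks precompute s = d_y * sum_j p_j at quantization time. A
// sketch with plain floats standing in for the fp16 fields:
static inline float example_q4_1_block_dot(float dx, float mx, int sumi,
                                           float dy, float sy) {
    return (dx * dy) * sumi + mx * sy; // sumi = sum of q*p, sy = dy * sum of p
}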

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -874,15 +883,30 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}
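// Q5_0 keeps the fifth bit of all 32 weights in a separate 32-bit field qh:
// bit j holds the high bit of the j-th low nibble and bit j+16 that of the
// j-th high nibble. The masks above splice it back in before the -16 offset.
// A sketch for one index j, mirroring the loop body:
static inline void example_q5_0_unpack(uint8_t byte, uint32_t qh, int j,
                                       int32_t * x0, int32_t * x1) {
    const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; // bit j    -> bit 4
    const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));      // bit j+16 -> bit 4
    *x0 = (int8_t)(((byte & 0x0F) | xh_0) - 16); // 5-bit value, centered on 0
    *x1 = (int8_t)(((byte >> 4) | xh_1) - 16);
}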

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -930,15 +954,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc) + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -977,15 +1016,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}
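// The q8_0 path has no nibbles to unpack: each block is 32 signed bytes plus
// one fp16 scale, so the whole block dot is an integer accumulation followed
// by a single float multiply. A worked sketch on a toy 4-element "block":
//   x.qs = { 1, -2, 3, 4 }, x.d = 0.5;  y.qs = { 2, 2, -1, 1 }, y.d = 0.25
//   sumi = 1*2 + (-2)*2 + 3*(-1) + 4*1 = -1
//   contribution = sumi * x.d * y.d = -1 * 0.125 = -0.125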

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1061,10 +1103,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}
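// Each q2_K scale byte packs two 4-bit fields: the low nibble scales the
// quantized weights, the high nibble scales the per-block minimum, and the
// precomputed y[i].bsums lets the min contribution be folded in per 16-value
// group without touching the weights again. A sketch of the split:
static inline void example_q2_K_scale_split(uint8_t sc, int * scale, int * min) {
    *scale = sc & 0xF; // multiplies the 2-bit quants (isuml above)
    *min   = sc >> 4;  // multiplies the bsums term (summs above)
}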

@@ -1162,13 +1239,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc);

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1257,14 +1391,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
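// The utmp[] shuffle above unpacks x[i].scales -- 12 bytes holding eight 6-bit
// scales and eight 6-bit mins -- into 16 plain bytes. A sketch of the same
// bit-twiddling as a standalone helper, with the mask constants these kernels
// define elsewhere (kmask1 = 0x3f3f3f3f, kmask2 = 0x0f0f0f0f,
// kmask3 = 0x03030303 in ggml):
static inline void example_q4_K_unpack_scales(const uint8_t * src, uint32_t utmp[4]) {
    const uint32_t kmask1 = 0x3f3f3f3f, kmask2 = 0x0f0f0f0f, kmask3 = 0x03030303;
    uint32_t tmp[3];
    memcpy(tmp, src, 12); // avoid unaligned uint32_t loads
    utmp[3] = ((tmp[2] >> 4) & kmask2) | (((tmp[1] >> 6) & kmask3) << 4);
    utmp[1] = (tmp[2] & kmask2) | (((tmp[0] >> 6) & kmask3) << 4);
    utmp[2] = tmp[1] & kmask1;
    utmp[0] = tmp[0] & kmask1;
    // bytes 0..7 of utmp are now the scales, bytes 8..15 the mins
}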

@@ -1360,14 +1541,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1445,10 +1678,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1545,10 +1815,34 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.125f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32[2];
    const uint8_t * aux8 = (const uint8_t *)aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(aux32, q2, 2*sizeof(uint32_t));
            q2 += 4;
            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}
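// The iq2_xxs decode above rebuilds 8 weights at a time: a byte indexes a
// precomputed grid of 8 magnitudes, and 7 packed sign bits expand through
// ksigns_iq2xs (the 8th sign is the parity of the other 7). A sketch of the
// sign-application step, with kmask_iq2xs[j] assumed to be the bit masks
// { 1, 2, 4, 8, 16, 32, 64, 128 } as in ggml-common.h:
static inline int example_iq2_signed_dot(const uint8_t * grid,
                                         const int8_t * q8, uint8_t signs) {
    int sumi = 0;
    for (int j = 0; j < 8; ++j) {
        // a set bit means the grid magnitude enters with negative sign
        sumi += grid[j] * q8[j] * ((signs & (uint8_t)(1u << j)) ? -1 : 1);
    }
    return sumi;
}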

@@ -1684,10 +1978,42 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = 0.125f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t * GGML_RESTRICT sc = x[i].scales;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -1779,11 +2105,47 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = 0.125f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
            int sumi1 = 0, sumi2 = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += ls1 * sumi1 + ls2 * sumi2;
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    *s = 0.125f * sumf;

#endif

}

void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1847,10 +2209,36 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.25f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}

@@ -1950,10 +2338,48 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
#endif
}

@@ -2034,10 +2460,36 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi = 0, sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
            const int delta = qh[ib] & 0x8000 ? -1 : 1;
            int lsum = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi += ls * lsum;
            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;

#endif
}

@@ -2151,10 +2603,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = hsum_float_8(accum);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}
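// Each iq4_xs block scale is 6 bits: the low 4 come from a nibble of
// scales_l and the top 2 from the rolling scales_h word, giving a value in
// [0, 63] that the -32 above re-centers. A sketch for one pair of 32-value
// sub-blocks, mirroring the loop (before h is shifted down by 4):
static inline void example_iq4_xs_scales(uint8_t scales_l, uint16_t h,
                                         int * ls1, int * ls2) {
    *ls1 = ((scales_l & 0xf) | ((h << 4) & 0x30)) - 32; // bits 0..1 of h on top
    *ls2 = ((scales_l >> 4) | ((h << 2) & 0x30)) - 32;  // bits 2..3 of h on top
}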


@@ -201,14 +201,24 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >> 4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}
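// The idiom shared by every function in this file: ib counts the blocks the
// SIMD branch consumed, so the plain-C loop `for (; ib < nb; ++ib)` doubles
// as both the full fallback (ib still 0) and the remainder handler. A sketch
// of the accumulation it performs for one q4_0 x q8_0 block, factored into a
// helper (hypothetical name, for illustration only):
static inline float example_q4_0_block_dot(const block_q4_0 * x,
                                           const block_q8_0 * y) {
    int sumi0 = 0, sumi1 = 0;
    for (int j = 0; j < QK4_0/2; ++j) {
        const int v0 = (x->qs[j] & 0x0F) - 8; // low nibble, re-centered
        const int v1 = (x->qs[j] >> 4) - 8;   // high nibble, re-centered
        sumi0 += v0 * y->qs[j];
        sumi1 += v1 * y->qs[j + QK4_0/2];
    }
    return (sumi0 + sumi1) * GGML_CPU_FP16_TO_FP32(x->d) * GGML_CPU_FP16_TO_FP32(y->d);
}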

void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -268,14 +278,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -340,14 +360,30 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -415,15 +451,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -484,15 +535,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -641,10 +695,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}

@@ -818,13 +907,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -984,14 +1130,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1149,14 +1342,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1311,10 +1556,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1455,10 +1737,34 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32[2];
    const uint8_t * aux8 = (const uint8_t *)aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(aux32, q2, 2*sizeof(uint32_t));
            q2 += 4;
            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -1563,10 +1869,42 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t * GGML_RESTRICT sc = x[i].scales;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -1692,11 +2030,47 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
            int sumi1 = 0, sumi2 = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += ls1 * sumi1 + ls2 * sumi2;
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    *s = 0.125f * sumf;

#endif

}

void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1798,10 +2172,36 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.25f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}

@@ -1927,10 +2327,48 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
#endif
}

@@ -2043,10 +2481,36 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi = 0, sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
            const int delta = qh[ib] & 0x8000 ? -1 : 1;
            int lsum = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi += ls * lsum;
            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;

#endif
}

@@ -2117,15 +2581,17 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}
|
||||
|
||||
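iq4_nl stores plain 4-bit indices into a fixed non-linear codebook (kvalues_iq4nl above), so the scalar path is a table lookup per nibble rather than an affine decode. A minimal sketch; the codebook values below are quoted from memory of ggml's table and should be checked against the source, and the helper name is illustrative:

#include <stdint.h>

// Stand-in for ggml's kvalues_iq4nl: 16 int8 levels, denser near zero.
static const int8_t kvalues_demo[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
};

// Decode the two codebook values packed into one iq4_nl byte.
static void iq4nl_decode_byte(uint8_t q, int8_t * lo, int8_t * hi) {
    *lo = kvalues_demo[q & 0xF];  // low nibble indexes the table
    *hi = kvalues_demo[q >> 4];   // high nibble indexes the table
}
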
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2230,10 +2696,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}


@@ -116,7 +116,6 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
//===================================== Dot products =================================

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_0;
    const int nb = n / qk;

@@ -133,6 +132,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    int ib = 0;
    float sumf = 0;

#if defined(__riscv_v)
    size_t vl = qk / 2;

    for (; ib < nb; ++ib) {
@@ -164,14 +164,27 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
#else
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >> 4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}

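The scalar tail loop above decodes two 4-bit values per byte: the low nibble maps to the first half of the block, the high nibble to the second half, and both are re-centered by subtracting 8 before the fp16 scale is applied. A standalone sketch of that decode (hypothetical helper; QK matches the QK8_0 block size used above):

#include <stdint.h>

#define QK 32  // block size, matching QK8_0 above

// Expand one q4_0 block (QK/2 packed bytes plus scale d) into QK floats:
// low nibbles fill out[0..QK/2-1], high nibbles fill out[QK/2..QK-1].
static void dequantize_q4_0_block(const uint8_t qs[QK/2], float d, float out[QK]) {
    for (int j = 0; j < QK/2; ++j) {
        out[j]        = ((int)(qs[j] & 0x0F) - 8) * d;
        out[j + QK/2] = ((int)(qs[j] >> 4)   - 8) * d;
    }
}
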
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_1;
    const int nb = n / qk;

@@ -188,6 +201,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    int ib = 0;
    float sumf = 0;

#if defined(__riscv_v)
    size_t vl = qk / 2;

    for (; ib < nb; ++ib) {
@@ -215,14 +229,27 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
#else
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_0;
    const int nb = n / qk;

@@ -240,6 +267,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    const block_q5_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__riscv_v)
    size_t vl;
    size_t vlenb = __riscv_vlenb();

@@ -269,14 +297,33 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
#else
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

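The q5_0 tail loop above rebuilds the fifth bit of each weight from the packed 32-bit qh word: bit j supplies the high bit for the low-nibble value at position j, and bit j+16 for the high-nibble value at j + qk/2. The same bit surgery in isolation (illustrative helper, arithmetically equivalent to the shifts above):

#include <stdint.h>

// Recover the two signed 5-bit values that byte qs_j contributes,
// given the packed high-bit word qh and the position j in the block.
static void q5_0_values(uint8_t qs_j, uint32_t qh, int j, int * v_lo, int * v_hi) {
    const uint8_t xh_0 = ((qh >> (j + 0))  & 1) << 4;  // high bit of the low-nibble value
    const uint8_t xh_1 = ((qh >> (j + 16)) & 1) << 4;  // high bit of the high-nibble value
    *v_lo = (int8_t)(((qs_j & 0x0F) | xh_0) - 16);     // re-centered to [-16, 15]
    *v_hi = (int8_t)(((qs_j >> 4)   | xh_1) - 16);
}
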
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_1;
    const int nb = n / qk;

@@ -294,6 +341,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    const block_q5_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__riscv_v)
    size_t vl;
    size_t vlenb = __riscv_vlenb();

@@ -322,10 +370,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
#else
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -363,17 +431,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
#else

    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);

    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -669,11 +738,44 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    float sumf = 0;

    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}

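Each q2_K scale byte above carries two packed fields: the low nibble scales the 2-bit quants (the dall term) and the high nibble weights the per-16 q8 block sums that feed the dmin correction. Split out for clarity (illustrative helper, not from the source):

#include <stdint.h>

// Split one q2_K scale byte as the scalar loop does with
// (sc[is] & 0xF) for the scale and (sc[j] >> 4) for the min.
static void q2_K_scale_fields(uint8_t sc, int * scale, int * min) {
    *scale = sc & 0xF;  // multiplies the 2-bit quant dot product
    *min   = sc >> 4;   // multiplies y bsums for the dmin subtraction
}
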
@@ -1045,14 +1147,68 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif

}
@@ -1378,15 +1534,60 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(nb);
    UNUSED(utmp);
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

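The utmp/kmask shuffle above rearranges q4_K's 12-byte scale block so that eight 6-bit scales and eight 6-bit mins can then be read as plain bytes through the scales/mins aliases. The same layout can also be read directly bit by bit; this sketch follows ggml's get_scale_min_k4 logic as remembered and should be checked against the source:

#include <stdint.h>

// Read 6-bit scale j and min j straight out of the 12-byte q4_K scale block.
static void get_scale_min_k4(int j, const uint8_t * q, uint8_t * sc, uint8_t * m) {
    if (j < 4) {
        *sc = q[j] & 63;
        *m  = q[j + 4] & 63;
    } else {
        *sc = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *m  = (q[j + 4] >> 4)  | ((q[j    ] >> 6) << 4);
    }
}
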
@@ -1497,15 +1698,65 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(nb);
    UNUSED(utmp);
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1773,11 +2024,46 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}


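The q6_K unpack above rebuilds each 6-bit weight from a 4-bit low part (one nibble of ql) and a 2-bit high part selected out of qh, then re-centers from [0, 63] to [-32, 31]. The same step in isolation (illustrative helper; lo4 is the already-selected nibble):

#include <stdint.h>

// Rebuild one q6_K weight from its selected low nibble and the two
// qh bits at 'shift', re-centered as in the loop above.
static int8_t q6_K_weight(uint8_t lo4, uint8_t qh, int shift) {
    return (int8_t)((lo4 | (((qh >> shift) & 3) << 4)) - 32);
}
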
@@ -112,7 +112,31 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    }

#endif
    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[8];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

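The (int8_t)(b << 4) / (b & 0xF0) pair in the GEMV loop above places each nibble in the top bits of a byte so that sign extension and the multiply happen at 16x scale, and a single arithmetic >> 4 at the end rescales the summed products exactly. The trick in isolation (illustrative helper; a_lo and a_hi stand for the two q8 activations each packed byte pairs with in the interleaved layout):

#include <stdint.h>

// Dot the two 4-bit values packed in b with two int8 activations using
// the shift trick from the loop above: both products carry a common
// factor of 16, so the final shift is exact.
static int packed_nibble_dot(uint8_t b, int8_t a_lo, int8_t a_hi) {
    const int v0 = (int8_t)(b << 4);    // low nibble, sign-extended, x16
    const int v1 = (int8_t)(b & 0xF0);  // high nibble, sign-extended, x16
    return ((v0 * a_lo) + (v1 * a_hi)) >> 4;
}
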
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -337,6 +361,37 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
        return;
    }

#endif
    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

@@ -172,15 +172,24 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = acc[0] + acc[1] + acc[2] + acc[3];

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >> 4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}

void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -230,15 +239,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -280,15 +298,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = acc[0] + acc[1] + acc[2] + acc[3];

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -421,13 +442,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -522,14 +600,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -642,14 +767,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -792,10 +969,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -972,15 +1186,17 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
    }

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}

void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1048,10 +1264,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}


@@ -435,15 +435,30 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -530,15 +545,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -598,15 +628,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -722,10 +755,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}

@@ -834,12 +902,68 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}
@@ -965,14 +1089,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1108,14 +1279,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1212,10 +1435,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}


@@ -253,12 +253,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .vec_dot_type = GGML_TYPE_Q8_1,
        .nrows        = 1,
    },
    [GGML_TYPE_MXFP4] = {
        .from_float   = quantize_row_mxfp4,
        .vec_dot      = ggml_vec_dot_mxfp4_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
        .nrows        = 1,
    },
    [GGML_TYPE_Q2_K] = {
        .from_float   = quantize_row_q2_K,
        .vec_dot      = ggml_vec_dot_q2_K_q8_K,
@@ -1676,10 +1670,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_add(params, tensor);
            } break;
        case GGML_OP_ADD_ID:
            {
                ggml_compute_forward_add_id(params, tensor);
            } break;
        case GGML_OP_ADD1:
            {
                ggml_compute_forward_add1(params, tensor);
@@ -1934,7 +1924,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            } break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
                ggml_compute_forward_flash_attn_ext(params, tensor);
                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
@@ -2022,11 +2012,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                ggml_compute_forward_opt_step_adamw(params, tensor);
            }
            break;
        case GGML_OP_OPT_STEP_SGD:
            {
                ggml_compute_forward_opt_step_sgd(params, tensor);
            }
            break;
        case GGML_OP_NONE:
            {
                // nop
@@ -2126,7 +2111,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_DUP:
        case GGML_OP_CONT:
        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_ACC:
            {
@@ -2188,7 +2172,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                    case GGML_GLU_OP_REGLU:
                    case GGML_GLU_OP_GEGLU:
                    case GGML_GLU_OP_SWIGLU:
                    case GGML_GLU_OP_SWIGLU_OAI:
                    case GGML_GLU_OP_GEGLU_ERF:
                    case GGML_GLU_OP_GEGLU_QUICK:
                        {
@@ -2330,7 +2313,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_CROSS_ENTROPY_LOSS:
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
        case GGML_OP_OPT_STEP_ADAMW:
        case GGML_OP_OPT_STEP_SGD:
            {
                n_tasks = n_threads;
            } break;
@@ -2691,7 +2673,6 @@ struct ggml_cplan ggml_graph_plan(
                    }
                } break;
            case GGML_OP_ADD:
            case GGML_OP_ADD_ID:
            case GGML_OP_ADD1:
                {
                    if (ggml_is_quantized(node->src[0]->type)) {

@@ -35,7 +35,7 @@

// ggml-backend interface

std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type() {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

@@ -57,6 +57,8 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
        }
#endif

        bufts.push_back(NULL);

        return bufts;
    }();

@@ -64,20 +66,14 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
}

static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
        bufts.push_back(nullptr);
        return bufts;
    }();

    return extra_bufts.data();
    return ggml_backend_cpu_get_extra_buffers_type().data();

    GGML_UNUSED(device);
}

static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
        if (extra == buft) {
    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra == buft) {
            return true;
        }
    }
@@ -214,10 +210,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
    ctx->abort_callback_data = NULL;

    ggml_backend_t cpu_backend = new ggml_backend {
        /* .guid      = */ ggml_backend_cpu_guid(),
        /* .iface     = */ ggml_backend_cpu_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context   = */ ctx,
        /* .guid      = */ ggml_backend_cpu_guid(),
        /* .interface = */ ggml_backend_cpu_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context   = */ ctx,
    };

    if (cpu_backend == NULL) {
@@ -401,13 +397,20 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
        return true;
    }

    // check extra buffer types
    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
    for (int i = 0; i < 4; i++) {
        if (op->src[i] && op->src[i]->buffer &&
            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
            return buf_extra->supports_op(dev, op);
    // extra_buffer_op?
    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra) {
            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
            if (buf_extra && buf_extra->supports_op(dev, op)) {
                return true;
            }
        }
    }

    // all other cases need a host buffer
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
            return false;
        }
}
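
The buffer-type registry above uses a common C++ idiom: a function-local static vector initialized exactly once by an immediately invoked lambda, with a trailing null pointer acting as a sentinel so the storage can be walked like a C array. A minimal sketch of that idiom under assumed stand-in types (the registry entries here are invented for illustration):

// Illustrative sketch, not part of the diff above.
#include <cstdio>
#include <vector>

using buft_t = const char *; // stand-in for ggml_backend_buffer_type_t

static std::vector<buft_t> & get_extra_buffer_types() {
    // initialized once, on first call, in a thread-safe way (C++11 magic static)
    static std::vector<buft_t> bufts = []() {
        std::vector<buft_t> b;
        b.push_back("amx");    // hypothetical entry
        b.push_back("repack"); // hypothetical entry
        b.push_back(nullptr);  // sentinel terminator
        return b;
    }();
    return bufts;
}

int main(void) {
    for (buft_t * p = get_extra_buffer_types().data(); *p; ++p) {
        printf("%s\n", *p);
    }
}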
@@ -259,10 +259,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const int64_t m_start = 0;

        const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
        int64_t num_threads = KAI_MIN(n / n_step, nth);
        if (num_threads <= 0) {
            num_threads = 1;
        }
        const int64_t num_threads = KAI_MIN(n / n_step, nth);

        if (ith < num_threads) {
            const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
@@ -312,8 +309,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        GGML_ASSERT(kernel);

        const int ith = params->ith;
        const int nth_raw = params->nth;
        const int nth = nth_raw > 0 ? nth_raw : 1;
        const int nth = params->nth;

        const size_t k = ne00;
        const size_t m = ne11;
@@ -331,12 +327,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
        const size_t n_start = ith * num_n_per_thread;

        size_t n_to_process = 0;
        if (n_start < n) {
            n_to_process = num_n_per_thread;
            if ((n_start + n_to_process) > n) {
                n_to_process = n - n_start;
            }
        size_t n_to_process = num_n_per_thread;
        if ((n_start + n_to_process) > n) {
            n_to_process = n - n_start;
        }

        // Calculate number of columns to be processed per thread
@@ -368,10 +361,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const void * lhs_ptr = (const void *)((const char *) lhs_packed + lhs_packed_offset);
        float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);

        if (n_to_process > 0) {
            variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
                               sizeof(float), -FLT_MAX, FLT_MAX);
        }
        variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
                           sizeof(float), -FLT_MAX, FLT_MAX);

        return true;
}
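
The partitioning logic above rounds the per-thread column count up to a multiple of the kernel's n_step and clamps the final thread at n. A standalone sketch of that arithmetic (round_up here is a local helper written for this sketch, not the KleidiAI kai_roundup):

// Illustrative sketch, not part of the diff above.
#include <cstdio>

static size_t round_up(size_t v, size_t step) { return ((v + step - 1) / step) * step; }

int main(void) {
    const size_t n = 100, nth = 3, n_step = 8;
    // per-thread share, rounded up to a multiple of n_step: here 40 columns
    const size_t per_thread = round_up(round_up(n, nth) / nth, n_step);
    for (size_t ith = 0; ith < nth; ++ith) {
        const size_t start = ith * per_thread;
        size_t todo = start < n ? per_thread : 0;
        if (start + todo > n) todo = n - start; // clamp the tail thread
        printf("thread %zu: cols [%zu, %zu)\n", ith, start, start + todo);
    }
}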
@@ -8,7 +8,6 @@
#include "vec.h"

#include <float.h>
#include <algorithm>

// ggml_compute_forward_dup

@@ -1284,7 +1283,6 @@ void ggml_compute_forward_add(
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -1311,77 +1309,6 @@ void ggml_compute_forward_add(
        }
    }
}

// ggml_compute_forward_add_id

static void ggml_compute_forward_add_id_f32(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(src2->type == GGML_TYPE_I32);

    GGML_ASSERT(src0->nb[0] == sizeof(float));
    GGML_ASSERT(src1->nb[0] == sizeof(float));

    const int ith = params->ith;
    const int nth = params->nth;

    const int nr = ggml_nrows(src0);

    GGML_TENSOR_TERNARY_OP_LOCALS

    GGML_ASSERT( nb0 == sizeof(float));
    GGML_ASSERT(nb10 == sizeof(float));

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 indices
        const int i3 = ir/(ne2*ne1);
        const int i2 = (ir - i3*ne2*ne1)/ne1;
        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

        // src1 indices
        const int i11 = *(int32_t *) ((char *) src2->data + i1*nb20 + i2*nb21);

        GGML_ASSERT(i11 >= 0 && i11 < ne11);

        ggml_vec_add_f32(ne0,
                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1 ),
                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
                (float *) ((char *) src1->data + i11*nb11));
    }
}

void ggml_compute_forward_add_id(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_add_id_f32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("unsupported type for ggml_compute_forward_add_id: %s", ggml_type_name(src0->type));
            }
    }
}
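
In plain terms, ADD_ID adds to each row of src0 one row of src1, selected per destination row by an integer index tensor (src2). A small self-contained sketch of the same gather-and-add on flat arrays:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    const int ne0 = 4; // row width
    float src0[2][4] = {{1, 1, 1, 1}, {2, 2, 2, 2}};
    float src1[3][4] = {{0, 0, 0, 0}, {10, 10, 10, 10}, {20, 20, 20, 20}};
    int   ids[2]     = {2, 1}; // which src1 row to add to each src0 row

    float dst[2][4];
    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < ne0; ++c) {
            dst[r][c] = src0[r][c] + src1[ids[r]][c]; // dst[r] = src0[r] + src1[ids[r]]
        }
    }
    printf("%g %g\n", dst[0][0], dst[1][0]); // 21 12
}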
// ggml_compute_forward_add1

static void ggml_compute_forward_add1_f32(
@@ -1733,7 +1660,6 @@ void ggml_compute_forward_add1(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -1861,7 +1787,6 @@ void ggml_compute_forward_acc(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -3689,93 +3614,6 @@ static void ggml_compute_forward_swiglu(
    }
}

// ggml_compute_forward_swiglu_oai

static void ggml_compute_forward_swiglu_oai_f32(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    char * src0_d = (char *) src0->data;
    char * src1_d = (char *) (src1 ? src1->data : src0->data);
    const size_t src0_o = src0->nb[1];
    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(ggml_is_contiguous_1(dst));

    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src0->type == src1->type);
    }

    const int ith = params->ith;
    const int nth = params->nth;

    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
    const int nr = ggml_nrows(src0);

    GGML_ASSERT(dst->ne[0] == nc);
    GGML_ASSERT(ggml_nrows(dst) == nr);

    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
    const float   alpha   = ggml_get_op_params_f32(dst, 2);
    const float   limit   = ggml_get_op_params_f32(dst, 3);

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int i1 = ir0; i1 < ir1; i1++) {
        float * src0_p = (float *) (src0_d + i1*src0_o);
        float * src1_p = (float *) (src1_d + i1*src1_o);
        float * dst_p  = (float *) ((char *) dst->data + i1*(dst->nb[1]));

        if (!src1) {
            src0_p += swapped ? nc : 0;
            src1_p += swapped ? 0 : nc;
        }

        for (int k = 0; k < nc; k++) {
            const float x = std::min(src0_p[k], limit);
            const float y = std::clamp(src1_p[k], -limit, limit);
            const float out_glu = x / (1.f + expf(alpha * (-x)));
            dst_p[k] = out_glu * (y + 1.f);
        }

#ifndef NDEBUG
        for (int k = 0; k < nc; k++) {
            const float x = dst_p[k];
            GGML_UNUSED(x);
            assert(!isnan(x));
            assert(!isinf(x));
        }
#endif
    }
}

static void ggml_compute_forward_swiglu_oai(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_swiglu_oai_f32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}
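
Per element, the kernel above computes out = glu(x) * (y + 1) with glu(x) = x * sigmoid(alpha * x), after clamping x from above by limit and y to [-limit, limit]. A minimal scalar sketch of that formula:

// Illustrative sketch, not part of the diff above.
#include <algorithm>
#include <cmath>
#include <cstdio>

static float swiglu_oai(float x, float y, float alpha, float limit) {
    x = std::min(x, limit);
    y = std::clamp(y, -limit, limit);
    const float glu = x / (1.0f + std::exp(alpha * (-x))); // x * sigmoid(alpha*x)
    return glu * (y + 1.0f);
}

int main(void) {
    printf("%f\n", swiglu_oai(1.0f, 0.5f, 1.702f, 7.0f));
}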
// ggml_compute_forward_geglu_erf

static void ggml_compute_forward_geglu_erf_f32(
@@ -4761,7 +4599,6 @@ void ggml_compute_forward_out_prod(
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5036,7 +4873,6 @@ void ggml_compute_forward_set(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5298,7 +5134,6 @@ void ggml_compute_forward_get_rows(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5688,7 +5523,6 @@ static void ggml_compute_forward_soft_max_f32(

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    assert(ggml_is_contiguous(dst));
    assert(ggml_are_same_shape(src0, dst));
@@ -5723,9 +5557,6 @@ static void ggml_compute_forward_soft_max_f32(

    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

    // sinks
    const float * sk = src2 ? (float *)((char *) src2->data) : nullptr;

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
@@ -5768,18 +5599,9 @@ static void ggml_compute_forward_soft_max_f32(
                float max = -INFINITY;
                ggml_vec_max_f32(ne00, &max, wp);

                // if we have sinks, make a correction as if they were included in the softmax
                if (sk) {
                    max = MAX(max, sk[i02]);
                }

                ggml_float sum = ggml_vec_soft_max_f32(ne00, dp, wp, max);
                assert(sum > 0.0);

                if (sk) {
                    sum += (ggml_float) expf(sk[i02] - max);
                }

                sum = 1.0/sum;
ggml_vec_scale_f32(ne00, dp, sum);
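
A sink behaves like one extra logit that participates in the normalizer but produces no output element: both the running max and the sum account for it, as in the hunk above. A small sketch of softmax over n logits with one sink value:

// Illustrative sketch, not part of the diff above.
#include <cmath>
#include <cstdio>

// softmax over x[0..n) as if an extra logit `sink` were present (but not emitted)
static void softmax_with_sink(const float * x, float * out, int n, float sink) {
    float max = sink;
    for (int i = 0; i < n; ++i) max = x[i] > max ? x[i] : max;

    double sum = std::exp(sink - max); // the sink's share of the probability mass
    for (int i = 0; i < n; ++i) {
        out[i] = std::exp(x[i] - max);
        sum += out[i];
    }
    for (int i = 0; i < n; ++i) out[i] /= (float) sum; // row now sums to < 1
}

int main(void) {
    float x[3] = {1.0f, 2.0f, 3.0f}, out[3];
    softmax_with_sink(x, out, 3, 0.5f);
    printf("%f %f %f\n", out[0], out[1], out[2]);
}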
@@ -6014,7 +5836,6 @@ void ggml_compute_forward_clamp(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -8168,14 +7989,12 @@ void ggml_compute_forward_argsort(

static void ggml_compute_forward_flash_attn_ext_f16(
        const ggml_compute_params * params,
        const ggml_tensor * q,
        const ggml_tensor * k,
        const ggml_tensor * v,
        const ggml_tensor * mask,
        ggml_tensor * dst) {

    const ggml_tensor * q     = dst->src[0];
    const ggml_tensor * k     = dst->src[1];
    const ggml_tensor * v     = dst->src[2];
    const ggml_tensor * mask  = dst->src[3];
    const ggml_tensor * sinks = dst->src[4];

    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
@@ -8370,23 +8189,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
            }
        }

        // sinks
        if (sinks) {
            const float s = ((float *)((char *) sinks->data))[h];

            float ms = 1.0f;
            float vs = 1.0f;

            if (s > M) {
                ms = expf(M - s);
                ggml_vec_scale_f32(DV, VKQ32, ms);
            } else {
                vs = expf(s - M);
            }

            S = S*ms + vs;
        }

        // V /= S
        const float S_inv = 1.0f/S;
ggml_vec_scale_f32(DV, VKQ32, S_inv);
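
The sink branch above is the standard online-softmax correction: if the sink logit s exceeds the running max M, the accumulated values are rescaled by exp(M - s); otherwise the sink only adds exp(s - M) to the normalizer S. A scalar sketch of that update (the function is illustrative; names mirror the code above):

// Illustrative sketch, not part of the diff above.
#include <cmath>
#include <cstdio>

// fold one sink logit `s` into a running (max M, normalizer S, accumulator V)
static void fold_sink(float s, float & M, float & S, float * V, int n) {
    float ms = 1.0f; // rescale factor for the existing accumulator
    float vs = 1.0f; // weight the sink contributes to the normalizer
    if (s > M) {
        ms = std::exp(M - s);
        for (int i = 0; i < n; ++i) V[i] *= ms;
        M = s; // the kernel above can skip this, since M is not needed afterwards
    } else {
        vs = std::exp(s - M);
    }
    S = S*ms + vs;
}

int main(void) {
    float V[2] = {1.0f, 2.0f}, M = 0.0f, S = 1.0f;
    fold_sink(1.5f, M, S, V, 2);
    printf("M=%f S=%f V0=%f\n", M, S, V[0]);
}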
@@ -8406,13 +8208,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(

void ggml_compute_forward_flash_attn_ext(
        const ggml_compute_params * params,
        const ggml_tensor * q,
        const ggml_tensor * k,
        const ggml_tensor * v,
        const ggml_tensor * mask,
        ggml_tensor * dst) {
    switch (dst->op_params[3]) {
        case GGML_PREC_DEFAULT:
        case GGML_PREC_F32:
            {
                // uses F32 accumulators
                ggml_compute_forward_flash_attn_ext_f16(params, dst);
                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
            } break;
        default:
            {
@@ -9274,10 +9080,6 @@ void ggml_compute_forward_glu(
            {
                ggml_compute_forward_swiglu(params, dst);
            } break;
        case GGML_GLU_OP_SWIGLU_OAI:
            {
                ggml_compute_forward_swiglu_oai(params, dst);
            } break;
        case GGML_GLU_OP_GEGLU_ERF:
            {
                ggml_compute_forward_geglu_erf(params, dst);
@@ -10330,7 +10132,6 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const int ir1 = MIN(ir0 + dr, nr);

    const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);

    const float alpha = adamw_params_ptr[0];
    const float beta1 = adamw_params_ptr[1];
    const float beta2 = adamw_params_ptr[2];
@@ -10338,7 +10139,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const float wd     = adamw_params_ptr[4];
    const float beta1h = adamw_params_ptr[5];
    const float beta2h = adamw_params_ptr[6];
    const float keep   = 1.f - alpha * wd;

    for (int ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir/(ne02*ne01);
        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
@@ -10361,7 +10162,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
            // The weight decay is applied independently of the Adam momenta m and v.
            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
            // See: https://arxiv.org/pdf/1711.05101v3.pdf
            w[i00] = w[i00] * keep - alpha * mh / vh;
            w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
        }
    }
}
@@ -10383,63 +10184,3 @@ void ggml_compute_forward_opt_step_adamw(
        }
    }
}

static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0       = dst->src[0];
    const ggml_tensor * src0_grad  = dst->src[1];
    const ggml_tensor * sgd_params = dst->src[2];

    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
    GGML_ASSERT(ggml_nelements(sgd_params) == 2);

    const int ith = params->ith;
    const int nth = params->nth;

    const int nr = ggml_nrows(src0);

    GGML_TENSOR_UNARY_OP_LOCALS
    GGML_ASSERT(nb00 == sizeof(float));

    // rows per thread
    const int dr = (nr + nth - 1) / nth;

    // row range for this thread
    const int ir0 = dr * ith;
    const int ir1 = MIN(ir0 + dr, nr);

    // using adamw param subset we care about - alpha, wd - could have a separate struct
    const float * sgd_params_ptr = ggml_get_data_f32(sgd_params);
    const float   alpha          = sgd_params_ptr[0];
    const float   keep           = 1.f - alpha * sgd_params_ptr[1];

    for (int ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir / (ne02 * ne01);
        const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
        const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);

        const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01;

        float       * w = (float       *) ((char       *) src0->data      + offset); // weight
        const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad

        for (int i00 = 0; i00 < ne00; ++i00) {
            w[i00] = w[i00] * keep - alpha * g[i00];
        }
    }
}

void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_opt_step_sgd_f32(params, dst);
            }
            break;
        default:
            {
                GGML_ABORT("fatal error - sgd is F32 only");
            }
    }
}
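
Both optimizers above apply decoupled weight decay: the weight is first shrunk by keep = 1 - alpha*wd and only then moved by the gradient (or Adam) step, following Loshchilov & Hutter (the arXiv link in the AdamW comment). A minimal sketch of the two update rules; mh and vh stand for the already bias-corrected moment terms exactly as in the code above:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    const float alpha = 0.01f, wd = 0.1f;
    const float keep  = 1.0f - alpha * wd; // decoupled decay factor

    // SGD with decoupled weight decay
    float w_sgd = 1.0f, g = 0.5f;
    w_sgd = w_sgd * keep - alpha * g;

    // AdamW: same decay factor, step uses the bias-corrected moments mh / vh
    float w_adamw = 1.0f, mh = 0.4f, vh = 2.0f;
    w_adamw = w_adamw * keep - alpha * mh / vh;

    printf("sgd=%f adamw=%f\n", w_sgd, w_adamw);
}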
@@ -29,7 +29,6 @@ extern "C" {

void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add_id(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -83,7 +82,13 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_ext(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * q,
        const struct ggml_tensor * k,
        const struct ggml_tensor * v,
        const struct ggml_tensor * mask,
        struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_back(
        const struct ggml_compute_params * params,
        const bool masked,
@@ -107,7 +112,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);

#ifdef __cplusplus
}
#endif

@@ -46,10 +46,6 @@ void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI
    quantize_row_q8_1_ref(x, y, k);
}

void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_mxfp4_ref(x, y, k);
}

//
// 2-6 bit quantization in super-blocks
//
@@ -185,37 +181,6 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
    *s = sumf;
}

void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0  * GGML_RESTRICT y = vy;

    const int nb = n / QK_MXFP4;

    int ib = 0;
    float sumf = 0;

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);

        int sumi1 = 0;
        int sumi2 = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
            sumi1 += y[ib].qs[j + 0]          * kvalues_mxfp4[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}
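
Formats such as MXFP4 and IQ4_NL store a 4-bit code per weight and decode it through a small lookup table; the dot product accumulates LUT[code] * q8 in integers and applies the combined block scale once at the end, as above. A generic sketch of that pattern (the 16-entry table below is made up for illustration, not kvalues_mxfp4):

// Illustrative sketch, not part of the diff above.
#include <cstdint>
#include <cstdio>

// hypothetical 16-entry codebook; the real formats use kvalues_mxfp4 / kvalues_iq4nl
static const int8_t kLUT[16] = {-8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 10};

int main(void) {
    const uint8_t packed[2] = {0x31, 0xF0};  // four 4-bit codes, two per byte
    const int8_t  q8[4]     = {5, -3, 2, 7}; // activations, low-nibble half first
    const float   d         = 0.25f;         // combined block scale

    int sumi = 0;
    for (int j = 0; j < 2; ++j) {
        sumi += q8[j]     * kLUT[packed[j] & 0xF]; // low nibble
        sumi += q8[j + 2] * kLUT[packed[j] >> 4];  // high nibble
    }
    printf("dot = %f\n", d * sumi);
}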

void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

@@ -19,8 +19,6 @@ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -41,8 +39,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -71,12 +67,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

@@ -206,9 +206,8 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);
    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
@@ -308,28 +307,30 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    int sumi;
    {
        float sumf[8];
        int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}
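
The v0/v1 construction above is a nibble-extraction trick: (int8_t)(q << 4) moves the low nibble into the top four bits and (int8_t)(q & 0xF0) keeps the high nibble there, so both come out as two's-complement nibbles scaled by 16; every product is then a multiple of 16, so the final >> 4 removes that factor exactly. Whether the stored nibbles are already two's complement depends on the repacking step, which is outside this excerpt. A tiny sketch:

// Illustrative sketch, not part of the diff above.
#include <cstdint>
#include <cstdio>

int main(void) {
    const uint8_t q = 0x2B;             // high nibble 2, low nibble 0xB (11)
    const int v0 = (int8_t) (q << 4);   // low  nibble, signed, times 16: (11-16)*16 = -80
    const int v1 = (int8_t) (q & 0xF0); // high nibble, signed, times 16:  2*16      =  32
    const int a0 = 3, a1 = -2;          // q8 activations
    const int sumi = ((v0 * a0) + (v1 * a1)) >> 4; // exact: -5*3 + 2*(-2) = -19
    printf("%d %d %d\n", v0, v1, sumi);
}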
@@ -411,11 +412,11 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    }
}

void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);
@@ -430,136 +431,30 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    float sum_minf[8];
    int sumi1,sumi2,sumi3,sumi4;
    int sumi;
    {
        float sumf[4];
        int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *)vy;
    for(int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (4 * blocklen)); k++) {
                const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi3 = 0;
                    sumi4 = 0;
                    sumi = 0;
                    int offset = ((k / 2) % 2) + j * 2;
                    for (int i = 0; i < blocklen; ++i){
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                        const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                        const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                        const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
                        sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
                        sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

                        sumi1 = sumi1 * (scales_0[offset] & 0xF);
                        sumi2 = sumi2 * (scales_1[offset] & 0xF);
                        sumi3 = sumi3 * (scales_2[offset] & 0xF);
                        sumi4 = sumi4 * (scales_3[offset] & 0xF);
                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            for(int sb = 0; sb < 8; sb++) {
                const uint8_t *mins = b_ptr[l].scales + sb * 16;
                for(int j = 0; j < ncols_interleaved; j++){
                    sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}

void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[8];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}
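
The sumf / sum_minf split in the q2_K kernels above exploits the K-quant decomposition w = d*sc*q - dmin*m: the first term needs per-value products, while the min term only needs the per-sub-block activation sums (the bsums). A small sketch checking that identity on one sub-block:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    const float d = 0.5f, dmin = 0.25f; // super-block scales
    const int   sc = 3, m = 2;          // 4-bit sub-block scale and min
    const int   q[4] = {1, 0, 3, 2};    // 2-bit quants
    const float y[4] = {0.5f, -1.0f, 2.0f, 1.5f};

    // direct: sum_i (d*sc*q_i - dmin*m) * y_i
    float direct = 0.0f;
    for (int i = 0; i < 4; ++i) direct += (d*sc*q[i] - dmin*m) * y[i];

    // factored: d*sc*(q . y) - dmin*m*bsum, with bsum = sum_i y_i
    float qy = 0.0f, bsum = 0.0f;
    for (int i = 0; i < 4; ++i) { qy += q[i]*y[i]; bsum += y[i]; }
    const float factored = d*sc*qy - dmin*m*bsum;

    printf("%f %f\n", direct, factored); // identical
}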
@@ -816,97 +711,6 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    }
}

void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    float sum_minf[4][8];
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (4 * blocklen)); k++) {

                    const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                    const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                    const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                    const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi3 = 0;
                            sumi4 = 0;
                            sumi = 0;
                            int offset = ((k / 2) % 2) + j * 2;
                            for (int i = 0; i < blocklen; ++i){
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
                            }
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for(int sb = 0; sb < 8; sb++) {
                    const uint8_t *mins = b_ptr[l].scales + sb * 16;
                    for(int m = 0; m < 4; m++) {
                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for(int j = 0; j < ncols_interleaved; j++) {
                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}

void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -963,50 +767,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
    }
}

void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert(n % qk == 0);
    assert(nr % 4 == 0);
    assert(nc % ncols_interleaved == 0);

    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

} // extern "C"

static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
@@ -1154,50 +914,6 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
    return out;
}

static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    block_q2_Kx8 out;

    // The delta (scale) and dmin values of the eight Q2_K structures are copied onto the interleaved output structure
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    const int end = QK_K * 2 / blck_size_interleave;

    // Interleave the Q2_K quants by taking 8 bytes at a time
    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;
        int src_offset = (i / 8) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    // The logic below unpacks and rearranges the scale and min values of Q2_K.
    // The Q2_K structure packs 16 scales and 16 mins into 16 bytes (4 bits each).
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins.
    // Each 16-byte group holds the scales and mins of the corresponding sub-blocks from the eight Q2_K structures.
    // For example, the first 16 bytes hold the first and second sub-block scales and mins of the eight Q2_K structures.

    for (int i = 0; i < 128; i++) {

        // Index selecting which Q2_K super-block
        int src1 = (i % 16) / 2;
        // Index selecting the scale within it
        int src2 = ((i / 16) * 2) + (i % 2);

        out.scales[i] = in[src1].scales[src2];
    }
    return out;

}
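
The scales loop above is a pure index permutation: output byte i takes scale byte src2 = (i/16)*2 + (i%2) from super-block src1 = (i%16)/2, so each 16-byte output group collects one pair of sub-block scale bytes from all eight Q2_K inputs. A sketch that just prints the mapping for the first output group:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    for (int i = 0; i < 16; ++i) {
        const int src1 = (i % 16) / 2;             // which of the 8 source super-blocks
        const int src2 = ((i / 16) * 2) + (i % 2); // which scale byte within it
        printf("out[%2d] = in[%d].scales[%d]\n", i, src1, src2);
    }
}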

static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
@@ -1259,37 +975,6 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
    GGML_UNUSED(data_size);
}

static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
    GGML_ASSERT(interleave_block == 8);
    constexpr int nrows_interleaved = 8;

    block_q2_Kx8 * dst = (block_q2_Kx8 *) t->data;
    const block_q2_K * src = (const block_q2_K *) data;
    block_q2_K dst_tmp[8];
    int nrow = ggml_nrows(t);
    int nblocks = t->ne[0] / QK_K;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));

    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
        return -1;
    }

    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;

    GGML_UNUSED(data_size);
}

static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 8);
@@ -1358,16 +1043,15 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s

static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
    GGML_ASSERT(interleave_block == 4);

    const block_iq4_nl * src = (const block_iq4_nl *) data;
    block_iq4_nlx4 * dst = (block_iq4_nlx4 *) t->data;

    block_iq4_nlx4 * dst = (block_iq4_nlx4 *) t->data;
    const block_iq4_nl * src = (const block_iq4_nl *) data;
    block_iq4_nl dst_tmp[4];

    int nrow = ggml_nrows(t);
    int nrows_interleaved = 4;
    int nblocks = t->ne[0] / QK4_NL;
    int nblocks = t->ne[0] / QK4_0;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

@@ -1389,63 +1073,6 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
||||
block_iq4_nlx8 out;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
out.d[i] = in[i].d;
|
||||
}
|
||||
|
||||
const int end = QK4_NL * 4 / blck_size_interleave;
|
||||
|
||||
if (blck_size_interleave == 8) {
|
||||
for (int i = 0; i < end; ++i) {
|
||||
int src_id = i % 8;
|
||||
int src_offset = (i / 8) * blck_size_interleave;
|
||||
int dst_offset = i * blck_size_interleave;
|
||||
|
||||
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
||||
GGML_ASSERT(interleave_block == 8);
|
||||
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
|
||||
|
||||
block_iq4_nl dst_tmp[8];
|
||||
|
||||
int nrow = ggml_nrows(t);
|
||||
int nrows_interleaved = 8;
|
||||
int nblocks = t->ne[0] / QK4_NL;
|
||||
|
||||
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
||||
|
||||
if (t->ne[1] % nrows_interleaved != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
||||
for (int64_t x = 0; x < nblocks; x++) {
|
||||
for (int i = 0; i < nrows_interleaved; i++) {
|
||||
dst_tmp[i] = src[x + i * nblocks];
|
||||
}
|
||||
*dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
|
||||
}
|
||||
src += nrows_interleaved * nblocks;
|
||||
}
|
||||
return 0;
|
||||
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::repack {
|
||||
// repack
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
||||
@@ -1468,10 +1095,6 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
|
||||
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
||||
}
|
||||
@@ -1481,10 +1104,6 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
|
||||
// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
|
||||
//}
|
||||
|
||||
template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
// gemv
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1505,18 +1124,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
||||
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
// gemm
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1537,18 +1148,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
||||
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
class tensor_traits_base : public ggml::cpu::tensor_traits {
public:
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;

@@ -1818,12 +1421,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

    // instance for Q2
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;

    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {

@@ -1847,18 +1446,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                return &q4_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
                return &q2_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_avx2()) {
            if (cur->ne[1] % 8 == 0) {
                return &iq4_nl_8x8_q8_0;
            }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
                return &iq4_nl_4x4_q8_0;
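The selection logic follows one rule: for each quantized type, return the widest interleaved layout whose SIMD requirement the CPU satisfies and whose column multiple divides the tensor's ne[1]; otherwise fall through so the tensor stays in its plain layout. The same rule in miniature, with hardcoded stand-ins for the ggml_cpu_has_* probes:

#include <cstdint>

enum class Layout { None, X4, X8 };

// Stand-ins for the CPU feature probes; hardcoded for illustration only.
static bool has_avx2()         { return true;  }
static bool has_neon_dotprod() { return false; }

// Pick the widest interleave whose column requirement divides n_cols
// (ne[1] in the real code); otherwise leave the tensor un-repacked.
static Layout pick_repack_layout(int64_t n_cols) {
    if (has_avx2() && n_cols % 8 == 0) {
        return Layout::X8;   // widest interleave first
    }
    if (has_neon_dotprod() && n_cols % 4 == 0) {
        return Layout::X4;   // narrower fallback
    }
    return Layout::None;
}

int main() {
    return pick_repack_layout(4096) == Layout::X8 ? 0 : 1;
}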
@@ -44,14 +44,7 @@ struct block_q4_Kx8 {
};

static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");

struct block_q2_Kx8 {
    ggml_half d[8];      // super-block scale for quantized scales
    ggml_half dmin[8];   // super-block scale for quantized mins
    uint8_t scales[128]; // scales and mins, quantized with 4 bits
    uint8_t qs[512];     // 2-bit quants
};

static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");

struct block_q8_Kx4 {
    float d[4];          // delta
    int8_t qs[QK_K * 4]; // quants

@@ -67,13 +60,6 @@ struct block_iq4_nlx4 {

static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

struct block_iq4_nlx8 {
    ggml_half d[8];         // deltas for 8 iq4_nl blocks
    uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
};

static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
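As a sanity check on the q2_K×8 layout, assuming the usual ggml constants (QK_K = 256, 2-byte ggml_half): d[8] plus dmin[8] give 32 bytes, scales[128] is QK_K/2 bytes, and the 2-bit quants of eight 256-value sub-blocks pack into 256 * 8 / 4 = QK_K * 2 = 512 bytes, i.e. 672 bytes total, exactly what the static_assert pins down. Spelled out:

#include <cstdint>

// Assumed values of the ggml constants used by the asserts above.
constexpr int QK_K = 256;
constexpr int GGML_HALF_SIZE = 2; // sizeof(ggml_half), an IEEE fp16

// d[8] + dmin[8] -> 16 halves; scales -> QK_K/2 bytes; 2-bit quants of
// 8 blocks x 256 values -> QK_K * 2 bytes.
constexpr int q2_Kx8_size = GGML_HALF_SIZE * 16 + QK_K / 2 + QK_K * 2;
static_assert(q2_Kx8_size == 672, "16*2 + 128 + 512 = 672 bytes");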
#if defined(__cplusplus)
extern "C" {
#endif

@@ -85,16 +71,12 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// Native implementations
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);

@@ -104,16 +86,12 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#if defined(__cplusplus)
} // extern "C"
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
} // namespace ggml::cpu

bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);

@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
}

bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);

@@ -33,6 +33,6 @@ class extra_buffer_type {
} // namespace ggml::cpu

// implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();

#endif
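Both functions share one double-dispatch shape: walk the registered extra buffer types, cast each entry's context to the C++ extra_buffer_type wrapper, and ask it for traits matching the op; the first owner wins, and a miss falls back to the default CPU path. A stripped-down sketch of that shape, with made-up registry and trait types rather than the real ggml API:

#include <vector>

struct op_t { int kind; };

// Hypothetical trait object: knows how to run one family of ops.
struct traits_t {
    virtual bool forward(const op_t & op) = 0;
    virtual ~traits_t() = default;
};

// Hypothetical registry entry: may or may not own traits for an op.
struct buffer_type_t {
    traits_t * (*get_tensor_traits)(const op_t &);
};

static std::vector<buffer_type_t> & registry() {
    static std::vector<buffer_type_t> r;
    return r;
}

// Mirror of the dispatch loop: first registered owner of the op wins.
static bool extra_compute_forward(const op_t & op) {
    for (auto & bt : registry()) {
        if (bt.get_tensor_traits) {
            if (traits_t * t = bt.get_tensor_traits(op)) {
                return t->forward(op);
            }
        }
    }
    return false; // fall back to the default CPU path
}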
@@ -55,22 +55,7 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
-    int i = 0;
-#if defined(__AVX2__)
-    for (; i + 7 < n; i += 8) {
-        __m256 vx = _mm256_loadu_ps(x + i);
-        __m256 vy = _mm256_loadu_ps(y + i);
-        __m256 vz = _mm256_add_ps(vx, vy);
-        _mm256_storeu_ps(z + i, vz);
-    }
-#endif
-    for (; i < n; ++i) {
-        z[i] = x[i] + y[i];
-    }
-}
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
@@ -1007,9 +992,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *

inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
-        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
-        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
-        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
    }
}
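Both variants of the loop body compute SwiGLU, y[i] = silu(x[i]) * g[i] with silu(v) = v / (1 + exp(-v)); only the local variable names changed. A scalar float reference, handy for validating the f16 path against:

#include <cmath>

// Reference SwiGLU: y[i] = silu(x[i]) * g[i], silu(v) = v / (1 + exp(-v)).
static void swiglu_f32_ref(int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        y[i] = (v / (1.0f + std::exp(-v))) * g[i];
    }
}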
@@ -120,10 +120,6 @@ if (CUDAToolkit_FOUND)

    set(CUDA_FLAGS -use_fast_math -extended-lambda)

-    if (GGML_CUDA_DEBUG)
-        list(APPEND CUDA_FLAGS -lineinfo)
-    endif()

    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
        # Options are:
        # - none (not recommended)
@@ -1,58 +0,0 @@
#include "add-id.cuh"

static __global__ void add_id_kernel(
        const float * src0, const float * src1, const int32_t * src2, float * dst,
        int64_t ne0, int64_t ne1,
        size_t nb01, size_t nb02,
        size_t nb11,
        size_t nb21
    ) {

    const int64_t i1 = blockIdx.x;
    const int64_t i2 = blockIdx.y;

    const int i11 = *(int32_t *) ((char *) src2 + i1*sizeof(int32_t) + i2*nb21);

    const size_t nb1 = ne0 * sizeof(float);
    const size_t nb2 = ne1 * nb1;

    float * dst_row = (float *)((char *)dst + i1*nb1 + i2*nb2);
    const float * src0_row = (const float *)((char *)src0 + i1*nb01 + i2*nb02);
    const float * src1_row = (const float *)((char *)src1 + i11*nb11);

    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        dst_row[i0] = src0_row[i0] + src1_row[i0];
    }
}

void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    GGML_TENSOR_TERNARY_OP_LOCALS

    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(src2->type == GGML_TYPE_I32);

    GGML_ASSERT(nb00 == sizeof(float));
    GGML_ASSERT(nb10 == sizeof(float));
    GGML_ASSERT(nb20 == sizeof(int32_t));

    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    const int32_t * src2_d = (const int32_t *)src2->data;
    float * dst_d = (float *)dst->data;

    int threads = std::min((int)ne00, 768); // cols
    dim3 blocks(ne01, ne02); // n_experts_used, n_tokens
    add_id_kernel<<<blocks, threads, 0, ctx.stream()>>>(
        src0_d, src1_d, src2_d, dst_d,
        ne0, ne1,
        nb01, nb02,
        nb11,
        nb21
    );
}
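Unrolled from the strides, the kernel computes dst[i0, i1, i2] = src0[i0, i1, i2] + src1[i0, src2[i1, i2]]: src2 selects, per (expert, token) pair, which row of src1 to add - the MoE per-expert bias pattern. A plain C++ reference of the same indexing on contiguous data, as a sketch to check the kernel against:

#include <cstdint>

// dst[i0, i1, i2] = src0[i0, i1, i2] + src1[i0, row[i1, i2]]
// ne0 = cols, ne1 = experts used per token, ne2 = tokens (contiguous data).
static void add_id_ref(const float * src0, const float * src1,
                       const int32_t * row, float * dst,
                       int64_t ne0, int64_t ne1, int64_t ne2) {
    for (int64_t i2 = 0; i2 < ne2; ++i2) {
        for (int64_t i1 = 0; i1 < ne1; ++i1) {
            const int32_t r = row[i2*ne1 + i1];          // selected src1 row
            const float * a = src0 + (i2*ne1 + i1)*ne0;  // input row
            const float * b = src1 + (int64_t) r * ne0;  // selected bias row
            float * d       = dst  + (i2*ne1 + i1)*ne0;
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                d[i0] = a[i0] + b[i0];
            }
        }
    }
}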
@@ -1,3 +0,0 @@
#include "common.cuh"

void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
Some files were not shown because too many files have changed in this diff.