server : simplify prompt state transition branches

fix : nullptr task dereference
cont : reduce parent checks
2026-02-12 14:03:20 +02:00 · 2026-01-09 17:46:03 +02:00 · 2026-01-09 17:32:39 +02:00 · 2026-01-09 16:44:07 +02:00 · 2026-01-09 16:42:29 +02:00 · 2026-01-09 15:36:58 +02:00
209 changed files with 9622 additions and 17956 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -13,7 +13,7 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.
 FROM ${CANN_BASE_IMAGE} AS build

 # -- Install build dependencies --
-RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
+RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build
 ARG TARGETARCH

 RUN apt-get update && \
-    apt-get install -y build-essential git cmake libssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev

 WORKDIR /app

--- a/.devops/cuda-new.Dockerfile
+++ b/.devops/cuda-new.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

 WORKDIR /app

--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

 WORKDIR /app

--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git libssl-dev
+    apt-get install -y git libcurl4-openssl-dev

 WORKDIR /app

--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -6,7 +6,7 @@ WORKDIR /app

 COPY . .

-RUN yum install -y gcc g++ cmake make openssl-devel
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -18,7 +18,7 @@ RUN apt-get update && \
    python3 \
    python3-pip \
    git \
-    libssl-dev \
+    libcurl4-openssl-dev \
    libgomp1

 WORKDIR /app
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -32,6 +32,7 @@
  useMpi ? false,
  useRocm ? config.rocmSupport,
  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
+  enableCurl ? true,
  useVulkan ? false,
  useRpc ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -159,13 +160,15 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ++ optionals useMpi [ mpi ]
    ++ optionals useRocm rocmBuildInputs
    ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs;
+    ++ optionals useVulkan vulkanBuildInputs
+    ++ optionals enableCurl [ curl ];

  cmakeFlags =
    [
      (cmakeBool "LLAMA_BUILD_SERVER" true)
      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
      (cmakeBool "GGML_NATIVE" false)
      (cmakeBool "GGML_BLAS" useBlas)
      (cmakeBool "GGML_CUDA" useCuda)
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -27,7 +27,7 @@ RUN apt-get update \
    build-essential \
    cmake \
    git \
-    libssl-dev \
+    libcurl4-openssl-dev \
    curl \
    libgomp1

--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt install -y --no-install-recommends \
        git cmake ccache ninja-build \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        libopenblas-dev libssl-dev && \
+        libopenblas-dev libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

 WORKDIR /app
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -5,8 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils

-# Install SSL and Vulkan SDK dependencies
-RUN apt install -y libssl-dev curl \
+# Install cURL and Vulkan SDK dependencies
+RUN apt install -y libcurl4-openssl-dev curl \
    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc

 # Build it
--- a/.github/actions/windows-setup-curl/action.yml
+++ b/.github/actions/windows-setup-curl/action.yml
@@ -0,0 +1,30 @@
+name: 'Windows - Setup CURL'
+description: 'Composite action, to be reused in other workflow'
+inputs:
+  curl_version:
+    description: 'CURL version'
+    required: false
+    default: '8.6.0_6'
+  architecture:
+    description: 'Architecture of the libcurl to download'
+    required: false
+    default: 'win64'
+outputs:
+  curl_path:
+    description: "Path to the downloaded libcurl"
+    value: ${{ steps.get_libcurl.outputs.curl_path }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: libCURL
+      id: get_libcurl
+      shell: powershell
+      env:
+        CURL_VERSION: ${{ inputs.curl_version }}
+        ARCHITECTURE: ${{ inputs.architecture }}
+      run: |
+        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
+        mkdir $env:RUNNER_TEMP/libcurl
+        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -20,7 +20,7 @@ jobs:
        run: |
          PREFIX="$(pwd)"/inst
          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
+                -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release
          cmake --install build --prefix "$PREFIX" --config Release
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -30,7 +30,7 @@ jobs:

  #     - name: Build
  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
+  #         cmake -B build -DLLAMA_CURL=OFF \
  #                        -DCMAKE_BUILD_TYPE=Release \
  #                        -DGGML_OPENMP=OFF \
  #                        -DLLAMA_BUILD_EXAMPLES=ON \
@@ -76,7 +76,7 @@ jobs:

  #     - name: Build
  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
+  #         cmake -B build -DLLAMA_CURL=OFF \
  #                        -DCMAKE_BUILD_TYPE=Release \
  #                        -DGGML_VULKAN=ON \
  #                        -DGGML_OPENMP=OFF \
@@ -122,7 +122,7 @@ jobs:

  #     - name: Build
  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
+  #         cmake -B build -DLLAMA_CURL=OFF \
  #                        -DCMAKE_BUILD_TYPE=Release \
  #                        -DGGML_VULKAN=ON \
  #                        -DGGML_OPENMP=OFF \
@@ -178,7 +178,7 @@ jobs:

      - name: Build
        run: |
-          cmake -B build -DLLAMA_OPENSSL=OFF \
+          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
@@ -235,7 +235,7 @@ jobs:

      - name: Build
        run: |
-          cmake -B build -DLLAMA_OPENSSL=OFF \
+          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
@@ -281,7 +281,7 @@ jobs:
      - name: Build
        run: |
          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
-          cmake -B build -DLLAMA_OPENSSL=OFF \
+          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -79,6 +79,7 @@ jobs:
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=OFF \
@@ -91,7 +92,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

  macOS-latest-cmake-x64:
    runs-on: macos-15-intel
@@ -117,6 +118,7 @@ jobs:
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL=OFF \
            -DGGML_RPC=ON \
@@ -225,6 +227,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(nproc)
@@ -233,7 +237,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

      - name: Test llama2c conversion
        id: llama2c_test
@@ -289,6 +293,8 @@ jobs:
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -299,6 +305,8 @@ jobs:
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@@ -328,10 +336,14 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build \
+          mkdir build
+          cd build
+          cmake .. \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_LLGUIDANCE=ON
-          cmake --build build --config Release -j $(nproc)
+          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -365,6 +377,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(nproc)

@@ -398,6 +412,8 @@ jobs:
        id: cmake_configure
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DGGML_BACKEND_DL=ON \
            -DGGML_CPU_ALL_VARIANTS=ON \
@@ -454,6 +470,8 @@ jobs:
        run: |
          source ./vulkan_sdk/setup-env.sh
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_VULKAN=ON
          cmake --build build --config Release -j $(nproc)

@@ -527,6 +545,8 @@ jobs:
        run: |
          export Dawn_DIR=dawn/lib64/cmake/Dawn
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_WEBGPU=ON
          cmake --build build --config Release -j $(nproc)

@@ -573,7 +593,7 @@ jobs:
          source emsdk/emsdk_env.sh
          emcmake cmake -B build-wasm \
            -DGGML_WEBGPU=ON \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg

          cmake --build build-wasm --target test-backend-ops -j $(nproc)
@@ -604,6 +624,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build -S . \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
            -DGGML_HIP=ON
@@ -635,6 +657,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build -S . \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_MUSA=ON
          cmake --build build --config Release -j $(nproc)

@@ -682,6 +706,8 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx
@@ -731,6 +757,8 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx \
@@ -865,7 +893,7 @@ jobs:
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -1015,7 +1043,7 @@ jobs:
        id: cmake_build
        run: |
          cmake -S . -B build ${{ matrix.defines }} `
-            -DLLAMA_BUILD_BORINGSSL=ON
+            -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Add libopenblas.dll
@@ -1073,6 +1101,8 @@ jobs:
          # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
          run: |
            cmake -S . -B build -G Ninja \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_FATAL_WARNINGS=ON \
              -DCMAKE_BUILD_TYPE=Release \
              -DCMAKE_CUDA_ARCHITECTURES=89-real \
@@ -1120,6 +1150,7 @@ jobs:
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DLLAMA_BUILD_SERVER=ON ^
+            -DLLAMA_CURL=OFF ^
            -DLLAMA_BUILD_BORINGSSL=ON ^
            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
@@ -1227,6 +1258,7 @@ jobs:
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
            -DCMAKE_BUILD_TYPE=Release `
+            -DLLAMA_CURL=OFF `
            -DLLAMA_BUILD_BORINGSSL=ON `
            -DROCM_DIR="${env:HIP_PATH}" `
            -DGGML_HIP=ON `
@@ -1253,7 +1285,7 @@ jobs:
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -1320,7 +1352,7 @@ jobs:
      matrix:
        include:
          - build: 'arm64-cpu'
-            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
+            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
          - build: 'arm64-snapdragon'
            defines: '--preset arm64-android-snapdragon-release'

@@ -1431,7 +1463,7 @@ jobs:
            "${{ steps.cann-image.outputs.image }}" \
            bash -lc '
              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
              yum clean all && rm -rf /var/cache/yum
              git config --global --add safe.directory "/workspace"
              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
@@ -1465,7 +1497,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
@@ -1491,7 +1523,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
@@ -1517,7 +1549,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
@@ -1543,7 +1575,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
@@ -1569,7 +1601,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
@@ -1733,7 +1765,7 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-           sudo apt-get install -y build-essential
+           sudo apt-get install -y build-essential libcurl4-openssl-dev

       - name: Test
         id: ggml-ci
@@ -1800,6 +1832,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
@@ -1817,7 +1851,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

      - name: Test llama2c conversion
        id: llama2c_test
@@ -1892,7 +1926,7 @@ jobs:
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=ON \
            -DLLAMA_BUILD_EXAMPLES=ON \
@@ -1911,7 +1945,7 @@ jobs:
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
@@ -1982,7 +2016,7 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
@@ -2056,6 +2090,8 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
@@ -2091,6 +2127,7 @@ jobs:
           sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
           apt-get install -y \
            build-essential \
+            libcurl4-openssl-dev \
            python3-venv \
            gpg \
            wget \
--- a/.github/workflows/copilot-setup-steps.yml
+++ b/.github/workflows/copilot-setup-steps.yml
@@ -38,7 +38,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          sudo apt-get install build-essential libcurl4-openssl-dev
          # Install git-clang-format script for formatting only changed code
          wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
          sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -37,6 +37,13 @@ jobs:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
      - name: Build
        id: cmake_build
        run: |
@@ -45,7 +52,6 @@ jobs:
            -DCMAKE_INSTALL_RPATH='@loader_path' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DGGML_RPC=ON \
@@ -84,6 +90,13 @@ jobs:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
      - name: Build
        id: cmake_build
        run: |
@@ -94,7 +107,6 @@ jobs:
            -DCMAKE_INSTALL_RPATH='@loader_path' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL=OFF \
            -DGGML_RPC=ON \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@@ -147,7 +159,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
@@ -200,7 +212,7 @@ jobs:
          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev

      - name: Build
        id: cmake_build
@@ -257,23 +269,34 @@ jobs:
        run: |
          choco install ninja

+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
+
      - name: Build
        shell: cmd
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
            -DGGML_OPENMP=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release

      - name: Pack artifacts
        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
          7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

@@ -351,7 +374,7 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
+          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
          cmake --build build --config Release --target ${{ matrix.target }}

      - name: Pack artifacts
@@ -405,7 +428,7 @@ jobs:
            -DGGML_NATIVE=OFF ^
            -DGGML_CPU=OFF ^
            -DGGML_CUDA=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
+            -DLLAMA_CURL=OFF ^
            -DGGML_CUDA_CUB_3DOT2=ON
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
@@ -474,7 +497,7 @@ jobs:
            -DCMAKE_BUILD_TYPE=Release ^
            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON
+            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-sycl -j

      - name: Build the release package
@@ -601,7 +624,7 @@ jobs:
            -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
-            -DLLAMA_BUILD_BORINGSSL=ON
+            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          md "build\bin\hipblaslt\library"
@@ -642,7 +665,7 @@ jobs:
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -721,7 +744,7 @@ jobs:
            "${{ steps.cann-image.outputs.image }}" \
            bash -lc '
              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
+              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
              yum clean all && rm -rf /var/cache/yum
              git config --global --add safe.directory "/workspace"
              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -168,6 +168,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@@ -180,6 +182,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@@ -191,6 +195,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -72,7 +72,7 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
+          cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
@@ -108,7 +108,7 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
+          cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -111,16 +111,11 @@ option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})

 # 3rd party libs
-option(LLAMA_HTTPLIB    "llama: httplib for downloading functionality" ON)
-option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
+option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_HTTPLIB    "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
+option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" OFF)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

-# deprecated
-option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
-if (LLAMA_CURL)
-    message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
-endif()
-
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -187,9 +182,6 @@ if (NOT MSVC)
    endif()
 endif()

-include("cmake/license.cmake")
-license_add_file("llama.cpp" "LICENSE")
-
 #
 # 3rd-party
 #
@@ -217,6 +209,11 @@ add_subdirectory(src)
 # utils, programs, examples and tests
 #

+if (NOT LLAMA_BUILD_COMMON)
+    message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
+    set(LLAMA_CURL OFF)
+endif()
+
 if (LLAMA_BUILD_COMMON)
    add_subdirectory(common)
    if (LLAMA_HTTPLIB)
@@ -238,19 +235,6 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
    add_subdirectory(tools)
 endif()

-# Automatically add all files from the 'licenses' directory
-file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
-
-foreach(FILE_PATH ${EXTRA_LICENSES})
-    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
-    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
-    license_add_file("${NAME}" "${FILE_PATH}")
-endforeach()
-
-if (LLAMA_BUILD_COMMON)
-    license_generate(common)
-endif()
-
 #
 # install
 #
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,7 +20,7 @@ If AI is used to generate any portion of the code, contributors must adhere to t
 1. Explicitly disclose the manner in which AI was employed.
 2. Perform a comprehensive manual review prior to submitting the pull request.
 3. Be prepared to explain every line of code they submitted when asked about it by a maintainer.
-4. Using AI to write pull request descriptions or to respond to human reviewers is strictly prohibited.
+4. Using AI to respond to human reviewers is strictly prohibited.

 For more info, please refer to the [AGENTS.md](AGENTS.md) file.

--- a/README.md
+++ b/README.md
@@ -200,7 +200,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
@@ -586,5 +585,6 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
 - [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
 - [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,52 +1,12 @@
 # Security Policy

- - [**Reporting a vulnerability**](#reporting-a-vulnerability)
- - [**Requirements**](#requirements)
- - [**Covered Topics**](#covered-topics)
 - [**Using llama.cpp securely**](#using-llamacpp-securely)
   - [Untrusted models](#untrusted-models)
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
   - [Untrusted environments or networks](#untrusted-environments-or-networks)
   - [Multi-Tenant environments](#multi-tenant-environments)
-
-## Reporting a vulnerability
-
-If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-
-Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
-
-A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
-
-> [!IMPORTANT]
-> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
-
-## Requirements
-
-Before submitting your report, ensure you meet the following requirements:
-
- You have read this policy and fully understand it.
- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
-
-Maintainers reserve the right to close the report if these requirements are not fulfilled.
-
-## Covered Topics
-
-Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
-
- `src/**/*`
- `ggml/**/*`
- `gguf-py/**/*`
- `tools/server/*`, **excluding** the following topics:
-    - Web UI
-    - Features marked as experimental
-    - Features not recommended for use in untrusted environments (e.g., router, MCP)
-    - Bugs that can lead to Denial-of-Service attack
-
-Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
-
-For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.
+ - [**Reporting a vulnerability**](#reporting-a-vulnerability)

 ## Using llama.cpp securely

@@ -95,3 +55,19 @@ If you intend to run multiple models in parallel with shared memory, it is your
 3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

 4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+
+## Reporting a vulnerability
+
+Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
+
+<!-- normal version -->
+However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+
+Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
+
+A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -414,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-sim --config Release -- -quiet

@@ -428,7 +428,7 @@ cmake -B build-ios-device -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-device --config Release -- -quiet

@@ -439,7 +439,7 @@ cmake -B build-macos -G Xcode \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-macos --config Release -- -quiet

@@ -453,7 +453,7 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -DLLAMA_HTTPLIB=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
    -S .
@@ -469,7 +469,7 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -DLLAMA_HTTPLIB=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
    -S .
@@ -487,7 +487,7 @@ cmake -B build-tvos-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-sim --config Release -- -quiet

@@ -502,7 +502,7 @@ cmake -B build-tvos-device -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-device --config Release -- -quiet

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,7 +45,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -297,8 +297,7 @@ function gg_sum_test_scripts {
 }

 function gg_get_model {
-    #local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
-    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
+    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
    if [[ -s $gguf_0 ]]; then
        echo -n "$gguf_0"
    else
--- a/cmake/download-models.cmake
+++ b/cmake/download-models.cmake
@@ -1,21 +0,0 @@
-get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
-file(MAKE_DIRECTORY "${DEST_DIR}")
-
-if(NOT EXISTS "${DEST}")
-    message(STATUS "Downloading ${NAME} from ggml-org/models...")
-endif()
-
-file(DOWNLOAD
-    "https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
-    "${DEST}"
-    TLS_VERIFY ON
-    EXPECTED_HASH ${HASH}
-    STATUS status
-)
-
-list(GET status 0 code)
-
-if(NOT code EQUAL 0)
-    list(GET status 1 msg)
-    message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
-endif()
--- a/cmake/license.cmake
+++ b/cmake/license.cmake
@@ -1,40 +0,0 @@
-define_property(GLOBAL PROPERTY LICENSE_TEXT
-    BRIEF_DOCS "Embedded licenses"
-    FULL_DOCS  "Global string containing all aggregated licenses"
-)
-
-function(license_add_file NAME FILE)
-    if(NOT IS_ABSOLUTE "${FILE}")
-        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
-    endif()
-    if(EXISTS "${FILE}")
-        set(TITLE "License for ${NAME}")
-        string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
-        file(READ "${FILE}" TEXT)
-        get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
-        string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
-        set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
-    else()
-        message(WARNING "License file '${FILE}' not found")
-    endif()
-endfunction()
-
-function(license_generate TARGET_NAME)
-    message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
-    get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
-
-    set(CPP_CONTENT "// Generated by CMake\n\n")
-    string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
-    string(APPEND CPP_CONTENT "${TEXT}")
-    string(APPEND CPP_CONTENT "nullptr\n")
-    string(APPEND CPP_CONTENT "};\n")
-
-    set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
-    file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
-
-    if(TARGET ${TARGET_NAME})
-        target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
-    else()
-        message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
-    endif()
-endfunction()
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -60,8 +60,6 @@ add_library(${TARGET} STATIC
    common.h
    console.cpp
    console.h
-    debug.cpp
-    debug.h
    download.cpp
    download.h
    http.h
@@ -97,7 +95,17 @@ endif()
 # TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
 set(LLAMA_COMMON_EXTRA_LIBS build_info)

-if (LLAMA_HTTPLIB)
+if (LLAMA_CURL)
+    # Use curl to download model url
+    find_package(CURL)
+    if (NOT CURL_FOUND)
+        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
+    endif()
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
+elseif (LLAMA_HTTPLIB)
+    # otherwise, use cpp-httplib
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
 endif()
@@ -147,3 +155,27 @@ if (LLAMA_LLGUIDANCE)
 endif ()

 target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Check if running in GitHub Actions
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # Copy all files from licenses/ to build/bin/
+    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            POST_BUILD
+            TARGET ${TARGET}
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
+    endforeach()
+endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2,10 +2,10 @@

 #include "chat.h"
 #include "common.h"
-#include "download.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
+#include "download.h"
 #include "preset.h"

 // fix problem with std::min and std::max
@@ -48,8 +48,6 @@

 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

-extern const char * LICENSES[];
-
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;

@@ -281,20 +279,12 @@ static std::string clean_file_name(const std::string & fname) {
 static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
    GGML_ASSERT(!params.model.hf_repo.empty());

-    // the returned hf_repo is without tag
-    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
-
-    // "latest" tag (default if not specified) is translated to "default" preset
-    if (hf_tag == "latest") {
-        hf_tag = "default";
-    }
-
    const bool offline = params.offline;
    std::string model_endpoint = get_model_endpoint();
-    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
+    auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";

    // prepare local path for caching
-    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
+    auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
    auto preset_path = fs_get_cache_file(preset_fname);
    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
    const bool has_preset = status >= 200 && status < 400;
@@ -303,15 +293,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
    if (has_preset) {
        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
        common_preset_context ctx(ex, /* only_remote_allowed */ true);
-        common_preset global;
+        common_preset global; // unused for now
        auto remote_presets = ctx.load_from_ini(preset_path, global);
-        remote_presets = ctx.cascade(global, remote_presets);
-        if (remote_presets.find(hf_tag) != remote_presets.end()) {
-            common_preset preset = remote_presets.at(hf_tag);
+        if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
+            common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
            preset.apply_to_params(params);
        } else {
-            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
        }
    } else {
        LOG_INF("%s", "no remote preset found, skipping\n");
@@ -341,7 +330,7 @@ static handle_model_result common_params_handle_model(
                if (model.path.empty()) {
                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
-                        exit(1); // error message already printed
+                        exit(1); // built without CURL, error message already printed
                    }
                    model.name    = model.hf_repo;      // repo name with tag
                    model.hf_repo = auto_detected.repo; // repo name without tag
@@ -1041,16 +1030,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            exit(0);
        }
    ));
-    add_opt(common_arg(
-        {"--license"},
-        "show source code license and dependencies",
-        [](common_params &) {
-            for (int i = 0; LICENSES[i]; ++i) {
-                printf("%s\n", LICENSES[i]);
-            }
-            exit(0);
-        }
-    ));
    add_opt(common_arg(
        {"-cl", "--cache-list"},
        "show list of models in cache",
@@ -1295,7 +1274,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.kv_unified = true;
        }
-    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--context-shift"},
        {"--no-context-shift"},
@@ -2877,18 +2856,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_threads_http = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(common_arg(
-        {"--cache-prompt"},
-        {"--no-cache-prompt"},
-        string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.cache_prompt = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
    add_opt(common_arg(
        {"--cache-reuse"}, "N",
        string_format(
-            "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
+            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
        ),
        [](common_params & params, int value) {
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -1403,118 +1403,6 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
    builder.add_content(builder.consume_rest());
 }

-static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
-    // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
-    // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
-    static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
-
-    if (!builder.syntax().parse_tool_calls) {
-        LOG_DBG("%s: not parse_tool_calls\n", __func__);
-        builder.add_content(builder.consume_rest());
-        return;
-    }
-
-    LOG_DBG("%s: parse_tool_calls\n", __func__);
-
-    // Find all <tool_call></tool_call> blocks
-    while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
-        builder.move_to(first->groups[0].end);
-        builder.consume_spaces();
-
-        builder.try_consume_literal("```json");
-        builder.try_consume_literal("```");
-        builder.consume_spaces();
-
-        // Consume JSON object
-        auto data = builder.consume_json();
-
-        builder.consume_spaces();
-        builder.try_consume_literal("```");
-        builder.consume_spaces();
-
-        if (!builder.try_consume_literal("</tool_call>")) {
-            throw common_chat_msg_partial_exception("incomplete tool call");
-        }
-        builder.consume_spaces();
-
-        // Extract name and arguments
-        std::string name;
-        std::string id;
-        nlohmann::ordered_json arguments;
-
-        const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
-            if (!obj.contains("name") || !obj.contains("arguments")) {
-                return false;
-            }
-            name = obj.at("name").get<std::string>();
-            arguments = obj.at("arguments");
-            if (obj.contains("id") && obj.at("id").is_string()) {
-                id = obj.at("id").get<std::string>();
-            }
-            return true;
-        };
-
-        if (!extract_args(data.json)) {
-            if (data.json.contains("function") && data.json.at("function").is_object()) {
-                auto fn = data.json.at("function");
-                extract_args(fn);
-                if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
-                    id = data.json.at("id").get<std::string>();
-                }
-            }
-        }
-
-        // If name is empty, treat the JSON object as content
-        if (name.empty()) {
-            LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
-            builder.add_content(data.json.dump());
-            continue;
-        }
-
-        std::string args_str = arguments.dump();
-        if (!builder.add_tool_call(name, id, args_str)) {
-            throw common_chat_msg_partial_exception("incomplete tool call");
-        }
-    }
-
-    builder.add_content(builder.consume_rest());
-}
-
-static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
-    LOG_DBG("%s: parsing exaone_moe\n", __func__);
-    // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
-    // First try to parse using the standard reasoning parsing method
-    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
-
-    auto start_pos = builder.pos();
-    auto found_end_think = builder.try_find_literal("</think>");
-    builder.move_to(start_pos);
-
-    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
-        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
-        common_chat_parse_exaone_moe_content(builder);
-    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
-        // If reasoning was parsed successfully, the remaining content is regular content
-        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
-        common_chat_parse_exaone_moe_content(builder);
-    } else {
-        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
-          LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
-          common_chat_parse_exaone_moe_content(builder);
-          return;
-        }
-        // If no reasoning tags found, check if we should treat everything as reasoning
-        if (builder.syntax().thinking_forced_open) {
-            // If thinking is forced open but no tags found, treat everything as reasoning
-            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
-            builder.add_reasoning_content(builder.consume_rest());
-        } else {
-            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
-            common_chat_parse_exaone_moe_content(builder);
-        }
-    }
-}
-
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<think>", "</think>");
    builder.add_content(builder.consume_rest());
@@ -1602,9 +1490,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_SOLAR_OPEN:
            common_chat_parse_solar_open(builder);
            break;
-        case COMMON_CHAT_FORMAT_EXAONE_MOE:
-            common_chat_parse_exaone_moe(builder);
-            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -670,7 +670,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
        case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
        case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
-        case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
        case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
        case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
        case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2540,65 +2539,6 @@ static common_chat_params common_chat_params_init_solar_open(const common_chat_t
    return data;
 }

-static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-
-    data.prompt = apply(tmpl, inputs);
-    data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
-    if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!inputs.enable_thinking) {
-            data.prompt += "</think>\n\n";
-        } else {
-            data.thinking_forced_open = true;
-        }
-    }
-
-    if (inputs.tools.is_array() && !inputs.tools.empty()) {
-        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
-        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-            std::vector<std::string> tool_rules;
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                std::string name = function.at("name");
-                auto parameters = function.at("parameters");
-                builder.resolve_refs(parameters);
-                // Expect: <tool_call>{"name": "<name>", "arguments": {...}}</tool_call>
-                tool_rules.push_back(builder.add_rule(
-                    name + "-call",
-                    "\"<tool_call>\" space " +
-                        builder.add_schema(name + "-obj", json{
-                            {"type", "object"},
-                            {"properties", {
-                                {"name",      json{{"const", name}}},
-                                {"arguments", parameters},
-                            }},
-                            {"required", json::array({"name", "arguments"})},
-                        }) +
-                    " space \"</tool_call>\" space"));
-            });
-
-            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
-            builder.add_rule("root",
-                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
-                (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
-
-            data.grammar_triggers.push_back({
-                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
-                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)?" : "") +
-                    "(<tool_call>)[\\s\\S]*"
-            });
-            data.preserved_tokens = {
-                "<think>",
-                "</think>",
-                "<tool_call>",
-                "</tool_call>",
-            };
-        });
-    }
-
-    return data;
-}
-
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
@@ -2769,13 +2709,6 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_xiaomi_mimo(tmpl, params);
    }

-    // EXAONE MoE format detection
-    if (src.find("<tool_call>") != std::string::npos &&
-        src.find("<tool_result>") != std::string::npos &&
-        src.find("<|tool_declare|>") != std::string::npos) {
-        return common_chat_params_init_exaone_moe(tmpl, params);
-    }
-
    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_hermes_2_pro(tmpl, params);
--- a/common/chat.h
+++ b/common/chat.h
@@ -125,7 +125,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_APRIEL_1_5,
    COMMON_CHAT_FORMAT_XIAOMI_MIMO,
    COMMON_CHAT_FORMAT_SOLAR_OPEN,
-    COMMON_CHAT_FORMAT_EXAONE_MOE,

    // These are intended to be parsed by the PEG parser
    COMMON_CHAT_FORMAT_PEG_SIMPLE,
--- a/common/common.h
+++ b/common/common.h
@@ -80,7 +80,6 @@ int32_t cpu_get_num_math();
 //

 enum llama_example {
-    LLAMA_EXAMPLE_BATCHED,
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
@@ -476,7 +475,6 @@ struct common_params {
    int32_t timeout_write     = timeout_read; // http write timeout in seconds
    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
-    bool    cache_prompt      = true;         // whether to enable prompt caching
    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
    int32_t cache_ram_mib     = 8192;         // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -1,165 +0,0 @@
-#include "debug.h"
-
-#include "log.h"
-
-#include <cmath>
-#include <string>
-
-static std::string common_ggml_ne_string(const ggml_tensor * t) {
-    std::string str;
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        str += std::to_string(t->ne[i]);
-        if (i + 1 < GGML_MAX_DIMS) {
-            str += ", ";
-        }
-    }
-    return str;
-}
-
-static float common_ggml_get_float_value(const uint8_t * data,
-                           ggml_type       type,
-                           const size_t *  nb,
-                           size_t          i0,
-                           size_t          i1,
-                           size_t          i2,
-                           size_t          i3) {
-    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
-    float  v;
-    if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
-    } else if (type == GGML_TYPE_F32) {
-        v = *(const float *) &data[i];
-    } else if (type == GGML_TYPE_I64) {
-        v = (float) *(const int64_t *) &data[i];
-    } else if (type == GGML_TYPE_I32) {
-        v = (float) *(const int32_t *) &data[i];
-    } else if (type == GGML_TYPE_I16) {
-        v = (float) *(const int16_t *) &data[i];
-    } else if (type == GGML_TYPE_I8) {
-        v = (float) *(const int8_t *) &data[i];
-    } else if (type == GGML_TYPE_BF16) {
-        v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
-    } else {
-        GGML_ABORT("fatal error");
-    }
-    return v;
-}
-
-template <bool abort>
-void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
-    GGML_ASSERT(n > 0);
-    float sum = 0;
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    sum += v;
-                }
-            }
-        }
-    }
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        LOG_ERR("                                     [\n");
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            if (i2 == n && ne[2] > 2 * n) {
-                LOG_ERR("                                      ..., \n");
-                i2 = ne[2] - n;
-            }
-            LOG_ERR("                                      [\n");
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                if (i1 == n && ne[1] > 2 * n) {
-                    LOG_ERR("                                       ..., \n");
-                    i1 = ne[1] - n;
-                }
-                LOG_ERR("                                       [");
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    if (i0 == n && ne[0] > 2 * n) {
-                        LOG_ERR("..., ");
-                        i0 = ne[0] - n;
-                    }
-                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    LOG_ERR("%12.4f", v);
-                    if (i0 < ne[0] - 1) {
-                        LOG_ERR(", ");
-                    }
-                }
-                LOG_ERR("],\n");
-            }
-            LOG_ERR("                                      ],\n");
-        }
-        LOG_ERR("                                     ]\n");
-        LOG_ERR("                                     sum = %f\n", sum);
-    }
-
-    if constexpr (abort) {
-        if (std::isnan(sum)) {
-            LOG_ERR("encountered NaN - aborting\n");
-            exit(0);
-        }
-    }
-}
-
-/**
- * GGML operations callback during the graph execution.
- *
- * @param t current tensor
- * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
- *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
- *            see ggml_backend_sched_eval_callback
- * @param user_data user data to pass at each call back
- * @return true to receive data or continue the graph, false otherwise
- */
-template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
-    auto * cb_data = (base_callback_data *) user_data;
-
-    const struct ggml_tensor * src0 = t->src[0];
-    const struct ggml_tensor * src1 = t->src[1];
-
-    if (ask) {
-        return true;  // Always retrieve data
-    }
-
-    bool matches_filter = cb_data->tensor_filters.empty();
-
-    if (!matches_filter) {
-        for (const auto & filter : cb_data->tensor_filters) {
-            if (std::regex_search(t->name, filter)) {
-                matches_filter = true;
-                break;
-            }
-        }
-    }
-
-    char src1_str[128] = { 0 };
-    if (src1) {
-        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
-    }
-
-    if (matches_filter) {
-        LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
-                ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
-                common_ggml_ne_string(t).c_str());
-    }
-
-    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
-
-    if (!is_host) {
-        auto n_bytes = ggml_nbytes(t);
-        cb_data->data.resize(n_bytes);
-        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
-    }
-
-    if (!ggml_is_quantized(t->type) && matches_filter) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
-    }
-
-    return true;
-}
-
-// Explicit template instantiations
-template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
-template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
-template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
-template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
--- a/common/debug.h
+++ b/common/debug.h
@@ -1,43 +0,0 @@
-#pragma once
-#include "common.h"
-#include <string>
-#include <vector>
-#include <regex>
-
-// common debug functions and structs
-
-// Print a tensor's detailed data
-// data - the tensor's data in byte format
-// type - the tensor's quantization type
-// ne   - the tensor dimensions array
-// nb   - the tensor strides array
-// n    - the number of rows/columns to fully print
-template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
-
-// Intended to use as callback for ggml_backend_sched_eval_callback
-// prints tensors that are processed in the computation graph
-// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
-// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
-// The template parameter determins whether an error should be thrown whenever a NaN is encountered
-// in a tensor (useful for stopping debug sessions on first erroneous tensor)
-// The callback data will be passed as the third parameter (user_data)
-template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
-struct base_callback_data {
-    std::vector<uint8_t>    data;
-    std::vector<std::regex> tensor_filters;
-
-    base_callback_data() = default;
-
-    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
-        for (const auto & pattern : filter_patterns) {
-            try {
-                std::string anchored_pattern = "^" + pattern;
-                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
-            } catch (const std::regex_error & e) {
-                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
-            }
-        }
-        params.cb_eval           = common_debug_cb_eval<false>;
-        params.cb_eval_user_data = this;
-    }
-};
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -19,7 +19,10 @@
 #include <thread>
 #include <vector>

-#if defined(LLAMA_USE_HTTPLIB)
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#elif defined(LLAMA_USE_HTTPLIB)
 #include "http.h"
 #endif

@@ -158,17 +161,336 @@ static bool is_http_status_ok(int status) {
    return status >= 200 && status < 400;
 }

-std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+#ifdef LLAMA_USE_CURL
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
    }
-    return {hf_repo, tag};
+};
+
+static CURLcode common_curl_perf(CURL * curl) {
+    CURLcode res = curl_easy_perform(curl);
+    if (res != CURLE_OK) {
+        LOG_ERR("%s: curl_easy_perform() failed\n", __func__);
+    }
+
+    return res;
 }

-#if defined(LLAMA_USE_HTTPLIB)
+// Send a HEAD request to retrieve the etag and last-modified headers
+struct common_load_model_from_url_headers {
+    std::string etag;
+    std::string last_modified;
+    std::string accept_ranges;
+};
+
+struct FILE_deleter {
+    void operator()(FILE * f) const { fclose(f); }
+};
+
+static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) {
+    common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+    static std::regex                    header_regex("([^:]+): (.*)\r\n");
+    static std::regex                    etag_regex("ETag", std::regex_constants::icase);
+    static std::regex                    last_modified_regex("Last-Modified", std::regex_constants::icase);
+    static std::regex                    accept_ranges_regex("Accept-Ranges", std::regex_constants::icase);
+    std::string                          header(buffer, n_items);
+    std::smatch                          match;
+    if (std::regex_match(header, match, header_regex)) {
+        const std::string & key   = match[1];
+        const std::string & value = match[2];
+        if (std::regex_match(key, match, etag_regex)) {
+            headers->etag = value;
+        } else if (std::regex_match(key, match, last_modified_regex)) {
+            headers->last_modified = value;
+        } else if (std::regex_match(key, match, accept_ranges_regex)) {
+            headers->accept_ranges = value;
+        }
+    }
+
+    return n_items;
+}
+
+static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) {
+    return std::fwrite(data, size, nmemb, static_cast<FILE *>(fd));
+}
+
+// helper function to hide password in URL
+static std::string llama_download_hide_password_in_url(const std::string & url) {
+    // Use regex to match and replace the user[:password]@ pattern in URLs
+    // Pattern: scheme://[user[:password]@]host[...]
+    static const std::regex url_regex(R"(^(?:[A-Za-z][A-Za-z0-9+.-]://)(?:[^/@]+@)?.$)");
+    std::smatch             match;
+
+    if (std::regex_match(url, match, url_regex)) {
+        // match[1] = scheme (e.g., "https://")
+        // match[2] = user[:password]@ part
+        // match[3] = rest of URL (host and path)
+        return match[1].str() + "********@" + match[3].str();
+    }
+
+    return url;  // No credentials found or malformed URL
+}
+
+static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) {
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
+#    if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    //   operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#    endif
+
+    curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);      // will trigger the HEAD verb
+    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L);  // hide head request progress
+    curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback);
+}
+
+static void common_curl_easy_setopt_get(CURL * curl) {
+    curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback);
+
+    //  display download progress
+    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+}
+
+static bool common_pull_file(CURL * curl, const std::string & path_temporary) {
+    if (std::filesystem::exists(path_temporary)) {
+        const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary));
+        LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str());
+        const std::string range_str = partial_size + "-";
+        curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str());
+    }
+
+    // Always open file in append mode could be resuming
+    std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "ab"));
+    if (!outfile) {
+        LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
+        return false;
+    }
+
+    common_curl_easy_setopt_get(curl);
+    curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get());
+
+    return common_curl_perf(curl) == CURLE_OK;
+}
+
+static bool common_download_head(CURL *              curl,
+                                 curl_slist_ptr &    http_headers,
+                                 const std::string & url,
+                                 const std::string & bearer_token) {
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr        = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+
+    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr);
+    common_curl_easy_setopt_head(curl, url);
+    return common_curl_perf(curl) == CURLE_OK;
+}
+
+// download one single file from remote URL to local path
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
+                                               const std::string & path,
+                                               const std::string & bearer_token,
+                                               const common_header_list & custom_headers) {
+    static const int max_attempts        = 3;
+    static const int retry_delay_seconds = 2;
+
+    for (int i = 0; i < max_attempts; ++i) {
+        std::string etag;
+
+        // Check if the file already exists locally
+        const auto file_exists = std::filesystem::exists(path);
+        if (file_exists) {
+            etag = read_etag(path);
+        } else {
+            LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+        }
+
+        bool head_request_ok = false;
+        bool should_download = !file_exists;  // by default, we should download if the file does not exist
+
+        // Initialize libcurl
+        curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+        common_load_model_from_url_headers headers;
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+        curl_slist_ptr http_headers;
+
+        for (const auto & h : custom_headers) {
+             std::string s = h.first + ": " + h.second;
+             http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
+        }
+        const bool     was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
+        if (!was_perform_successful) {
+            head_request_ok = false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code == 200) {
+            head_request_ok = true;
+        } else {
+            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+            head_request_ok = false;
+        }
+
+        // if head_request_ok is false, we don't have the etag or last-modified headers
+        // we leave should_download as-is, which is true if the file does not exist
+        bool should_download_from_scratch = false;
+        if (head_request_ok) {
+            // check if ETag or Last-Modified headers are different
+            // if it is, we need to download the file again
+            if (!etag.empty() && etag != headers.etag) {
+                LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(),
+                        headers.etag.c_str());
+                should_download              = true;
+                should_download_from_scratch = true;
+            }
+        }
+
+        const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none";
+        if (should_download) {
+            if (file_exists &&
+                !accept_ranges_supported) {  // Resumable downloads not supported, delete and start again.
+                LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+                if (remove(path.c_str()) != 0) {
+                    LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                    return -1;
+                }
+            }
+
+            const std::string path_temporary = path + ".downloadInProgress";
+            if (should_download_from_scratch) {
+                if (std::filesystem::exists(path_temporary)) {
+                    if (remove(path_temporary.c_str()) != 0) {
+                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
+                        return -1;
+                    }
+                }
+
+                if (std::filesystem::exists(path)) {
+                    if (remove(path.c_str()) != 0) {
+                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                        return -1;
+                    }
+                }
+            }
+            if (head_request_ok) {
+                write_etag(path, headers.etag);
+            }
+
+            // start the download
+            LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
+                    __func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(),
+                    headers.etag.c_str(), headers.last_modified.c_str());
+            const bool was_pull_successful = common_pull_file(curl.get(), path_temporary);
+            if (!was_pull_successful) {
+                if (i + 1 < max_attempts) {
+                    const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
+                    LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
+                    std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+                } else {
+                    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+                }
+
+                continue;
+            }
+
+            long http_code = 0;
+            curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+
+            int status = static_cast<int>(http_code);
+            if (!is_http_status_ok(http_code)) {
+                LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+                return status; // TODO: maybe only return on certain codes
+            }
+
+            if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+                LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+                return -1;
+            }
+
+            return static_cast<int>(http_code);
+        } else {
+            LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+
+            return 304; // Not Modified - fake cached response
+        }
+    }
+
+    return -1; // max attempts reached
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::vector<char> res_buffer;
+
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        auto data_vec = static_cast<std::vector<char> *>(data);
+        data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (params.timeout > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+    }
+    if (params.max_size > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+    }
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+
+    for (const auto & header : params.headers) {
+        std::string header_ = header.first + ": " + header.second;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        std::string error_msg = curl_easy_strerror(res);
+        throw std::runtime_error("error: cannot make GET request: " + error_msg);
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+    return { res_code, std::move(res_buffer) };
+}
+
+#elif defined(LLAMA_USE_HTTPLIB)

 class ProgressBar {
    static inline std::mutex mutex;
@@ -465,6 +787,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
    return { res->status, std::move(buf) };
 }

+#endif // LLAMA_USE_CURL
+
+#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
+
 int common_download_file_single(const std::string & url,
                                const std::string & path,
                                const std::string & bearer_token,
@@ -596,8 +922,12 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
                                      const std::string & bearer_token,
                                      bool offline,
                                      const common_header_list & custom_headers) {
-    // the returned hf_repo is without tag
-    auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }

    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;

@@ -815,7 +1145,7 @@ int common_download_file_single(const std::string &,
    throw std::runtime_error("download functionality is not enabled in this build");
 }

-#endif // defined(LLAMA_USE_HTTPLIB)
+#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB

 std::vector<common_cached_model_info> common_list_cached_models() {
    std::vector<common_cached_model_info> models;
--- a/common/download.h
+++ b/common/download.h
@@ -17,12 +17,6 @@ struct common_remote_params {
 // get remote file content, returns <http_code, raw_response_body>
 std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

-// split HF repo with tag into <repo, tag>
-// for example: "user/model:tag" -> <"user/model", "tag">
-// if tag is not present, default to "latest"
-// example: "user/model" -> <"user/model", "latest">
-std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);
-
 struct common_cached_model_info {
    std::string manifest_path;
    std::string user;
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -32,10 +32,8 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
        "batch-size",
        "ubatch-size",
        "cache-reuse",
-        "chat-template-kwargs",
-        "mmap",
        // note: sampling params are automatically allowed by default
-        // negated args will be added automatically if the positive arg is specified above
+        // negated args will be added automatically
    };

    std::set<std::string> allowed_keys;
@@ -320,11 +318,6 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
        }
        LOG_DBG("loading preset: %s\n", preset.name.c_str());
        for (const auto & [key, value] : section.second) {
-            if (key == "version") {
-                // skip version key (reserved for future use)
-                continue;
-            }
-
            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
            if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
                throw std::runtime_error(string_format(
@@ -341,10 +334,7 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
                }
                LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
            } else {
-                throw std::runtime_error(string_format(
-                    "option '%s' not recognized in preset '%s'",
-                    key.c_str(), preset.name.c_str()
-                ));
+                // TODO: maybe warn about unknown key?
            }
        }

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -528,11 +528,7 @@ class ModelBase:
        return ()

    def prepare_tensors(self):
-        # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
-        if self.tensor_map.mapping:
-            max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
-        else:
-            max_name_len = len("vision_encoder.weight,")  # Default reasonable length
+        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
            # we don't need these
@@ -1252,9 +1248,6 @@ class TextModel(ModelBase):
        if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
            # ref: https://huggingface.co/upstage/Solar-Open-100B
            res = "solar-open"
-        if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
-            # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
-            res = "exaone-moe"

        if res is None:
            logger.warning("\n")
@@ -4370,37 +4363,7 @@ class Qwen3NextModel(Qwen2MoeModel):
        elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
            data_torch = data_torch + 1

-        if "in_proj_qkvz.weight" in name:
-            # original order:  [q, k, v, z] * head_count
-            # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count]
-            head_k_dim = self.hparams["linear_key_head_dim"]
-            head_v_dim = self.hparams["linear_value_head_dim"]
-            num_v_heads = self.hparams["linear_num_value_heads"]
-            num_k_heads = self.hparams["linear_num_key_heads"]
-            hidden_size = self.hparams["hidden_size"]
-            split_arg_list_qkvz = [
-                head_k_dim, # q partition
-                head_k_dim, # k partition
-                (num_v_heads // num_k_heads * head_v_dim), # v partition
-                (num_v_heads // num_k_heads * head_v_dim), # z partition
-            ]
-            # view as (n_embd, head_count, [q+k+v+z])
-            data_torch = data_torch.permute(1, 0).contiguous()
-            data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz))
-            # split into q, k, v, z
-            q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1)
-            # flatten dim + head_count
-            q = q.contiguous().view(hidden_size, -1)
-            k = k.contiguous().view(hidden_size, -1)
-            v = v.contiguous().view(hidden_size, -1)
-            z = z.contiguous().view(hidden_size, -1)
-            # stack back
-            qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous()
-            z = z.permute(1, 0).contiguous()
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV,  bid, ".weight"), qkv)
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z)
-        else:
-            yield from super().modify_tensors(data_torch, name, bid)
+        yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("RND1")
@@ -6075,175 +6038,7 @@ class Gemma3VisionModel(MmprojModel):
        return [] # skip other tensors


-class ConformerAudioModel(MmprojModel):
-    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
-
-    @staticmethod
-    def is_audio_tensor(name: str):
-        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
-
-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if ConformerAudioModel.is_audio_tensor(name):
-            if ".conv" in name or "_conv" in name and ".weight" in name:
-                return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # fold running_mean, running_var and eps into weight and bias for batch_norm
-        if "batch_norm" in name:
-            if self._batch_norm_tensors is None:
-                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
-            assert bid is not None
-            self._batch_norm_tensors[bid][name] = data_torch
-
-            if len(self._batch_norm_tensors[bid]) < 5:
-                return []
-
-            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
-            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
-            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
-            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
-            eps = 1e-5 # default value
-
-            a = weight / torch.sqrt(running_var + eps)
-            b = bias - running_mean * a
-            return [
-                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
-                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
-            ]
-
-        # reshape conv weights
-        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
-            data_torch = data_torch[:, None, None]
-        if "conv.depthwise_conv" in name and name.endswith(".weight"):
-            assert data_torch.shape[1] == 1
-            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
-        if "conv.pointwise_conv" in name and name.endswith(".weight"):
-            assert data_torch.shape[2] == 1
-            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-
@ModelBase.register("Gemma3nForConditionalGeneration")
-class Gemma3nVisionAudioModel(ConformerAudioModel):
-    has_audio_encoder = True
-    has_vision_encoder = True
-
-    # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
-    # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py
-    block_tensor_mapping = {
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight":             "v.blk.{bid}.{sid}.conv_exp.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight":                  "v.blk.{bid}.{sid}.bn1.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight":             "v.blk.{bid}.{sid}.conv_pwl.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight":                  "v.blk.{bid}.{sid}.bn2.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight":        "v.blk.{bid}.{sid}.dw_start.conv.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight":          "v.blk.{bid}.{sid}.dw_start.bn.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight":          "v.blk.{bid}.{sid}.dw_mid.conv.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight":            "v.blk.{bid}.{sid}.dw_mid.bn.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight":          "v.blk.{bid}.{sid}.pw_exp.conv.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight":            "v.blk.{bid}.{sid}.pw_exp.bn.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight":         "v.blk.{bid}.{sid}.pw_proj.conv.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight":           "v.blk.{bid}.{sid}.pw_proj.bn.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma":           "v.blk.{bid}.{sid}.layer_scale.gamma",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight":      "v.blk.{bid}.{sid}.attn.query.proj.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight":        "v.blk.{bid}.{sid}.attn.key.proj.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight":      "v.blk.{bid}.{sid}.attn.value.proj.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight":     "v.blk.{bid}.{sid}.attn.output.proj.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight":   "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight":        "v.blk.{bid}.{sid}.attn.key.norm.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight":      "v.blk.{bid}.{sid}.attn.value.norm.weight",
-        "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight":                 "v.blk.{bid}.{sid}.norm.weight",
-    }
-
-    def __init__(self, *args, **kwargs):
-        # Parent init will call find_hparam which now returns 0 for empty keys
-        super().__init__(*args, **kwargs)
-        assert self.hparams_vision is not None
-        self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
-        self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
-        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
-
-        # MobileNetV5 does not use image_mean/std
-        self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0]
-        self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0]
-        self.hparams_vision["image_size"] = self.preprocessor_config.get(
-            "size", {"height": 768, "width": 768}
-        )["height"]
-
-        # Image sequence length (256 tokens = 16x16 for Gemma3n)
-        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
-        image_size = self.hparams_vision["image_size"]
-        self.hparams_vision["patch_size"] = image_size // image_seq_length
-
-        # remap audio hparams
-        assert self.hparams_audio is not None
-        self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
-        self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
-        self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
-        self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        # vision params
-        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
-
-        # audio params
-        assert self.hparams_audio is not None
-        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
-        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
-        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
-
-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        # Force quantization settings for specific tensor types
-        if "input_projection" in name or "input_proj" in name:
-            return gguf.GGMLQuantizationType.F16
-        if ".embeddings." in name or "stem" in name:
-            return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
-    def custom_map(self, name: str) -> str:
-        """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
-        parts = name.split(".")
-        # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
-        if len(parts) >= 7:
-            bid, sid = parts[4], parts[5]
-            suffix = ".".join(parts[6:])
-            template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
-            if template in self.block_tensor_mapping:
-                return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
-
-        raise ValueError(f"Unknown name: {name}")
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if (ConformerAudioModel.is_audio_tensor(name)):
-            name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
-            return super().modify_tensors(data_torch, name, bid)
-
-        # Gemma3n uses
-        # - model.embed_vision.* for projection layers
-        # - model.vision_tower.* for vision encoder
-        # Skip non-vision tensors
-        if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
-            return []
-
-        if name.startswith("model.vision_tower.timm_model.blocks."):
-            # Double-indexed block tensors through custom logic
-            new_name = self.custom_map(name)
-        else:
-            # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py
-            new_name = self.map_tensor_name(name)
-
-        if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
-            data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1]
-
-        return [(new_name, data_torch)]
-
-
-@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
 class Gemma3NModel(Gemma3Model):
    model_arch = gguf.MODEL_ARCH.GEMMA3N
    norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
@@ -6266,25 +6061,8 @@ class Gemma3NModel(Gemma3Model):
        ]

    def set_vocab(self):
-        # For Gemma3n multimodal models, we need the FULL vocab_size (262400)
-        # which includes special tokens from 262144-262399 for vision/audio.
-        # The vocab_size_per_layer_input (262144) is only the embedding size per layer.
-        # Temporarily override the hparams lookup order to prioritize vocab_size.
-
-        # Store original vocab_size_per_layer_input if it exists
-        vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
-
-        # Temporarily remove vocab_size_per_layer_input to force using vocab_size
-        if vocab_size_per_layer_input is not None:
-            del self.hparams["vocab_size_per_layer_input"]
-
-        # Call parent set_vocab which will now use vocab_size (262400)
        super().set_vocab()

-        # Restore vocab_size_per_layer_input for later use
-        if vocab_size_per_layer_input is not None:
-            self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
-
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
@@ -6320,32 +6098,8 @@ class Gemma3NModel(Gemma3Model):
        if "language_model." not in name:
            return [] # skip non-language model tensors

-        # Pad token embeddings for vision/audio special tokens (262144-262399)
-        if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
-            # Move to CPU to avoid meta device issues during padding
-            data_torch = data_torch.to(device="cpu")
-
-            vocab_size = self.hparams.get("vocab_size", 262400)
-            current_size = data_torch.shape[0]  # First dimension is vocab_size
-
-            if current_size < vocab_size:
-                # Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
-                padding_size = vocab_size - current_size
-                tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
-                logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
-
-                # Create padding with zeros (vision tokens won't use these embeddings)
-                padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
-                data_torch = torch.cat([data_torch, padding], dim=0)
-
-            # Continue with normal processing
-            name = name.replace("language_model.", "")
-            return [(self.map_tensor_name(name), data_torch)]
-
        if "altup_unembed_projections" in name:
            data_torch = data_torch.to(device="cpu")
-            # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
-            # They should NOT be padded
            if ".0." in name:
                self._altup_unembd[0] = data_torch
            elif ".1." in name:
@@ -8751,102 +8505,6 @@ class Exaone4Model(TextModel):
                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


-@ModelBase.register("ExaoneMoEForCausalLM")
-class ExaoneMoEModel(Exaone4Model):
-    model_arch = gguf.MODEL_ARCH.EXAONE_MOE
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
-        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
-        moe_intermediate_size = self.hparams["moe_intermediate_size"]
-        num_shared_experts = self.hparams["num_shared_experts"]
-        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
-        self.gguf_writer.add_expert_shared_count(num_shared_experts)
-        self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts)
-        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
-        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
-        n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
-        self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
-        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))
-
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.startswith("mtp."):
-            if name.find("layers.") != -1:
-                # `mtp.layers.0.[module_name]` format
-                name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}")
-            else:
-                # mtp fc/norm weights
-                remapper = {
-                    "mtp.fc": "model.layers.{bid}.eh_proj",
-                    "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
-                    "mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm",
-                    "mtp.norm": "model.layers.{bid}.shared_head.norm",
-                }
-                _n = Path(name)
-                new_name = remapper[_n.stem] + _n.suffix
-
-                # set shared weights for all NextN/MTP layers
-                tensors = []
-                for bid in range(self.hparams['num_hidden_layers'], self.block_count):
-                    new_name = new_name.format(bid=bid)
-                    tensors.append((self.map_tensor_name(new_name), data_torch))
-                return tensors
-
-        if name.endswith("e_score_correction_bias"):
-            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
-
-        if name.find("mlp.experts") != -1:
-            n_experts = self.hparams["num_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
    """Conversion for IBM's GraniteForCausalLM"""
@@ -10278,7 +9936,7 @@ class LFM2Model(TextModel):
        self._add_feed_forward_length()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
+        if self._is_vision_tensor(name) or self._is_audio_tensor(name):
            # skip multimodal tensors
            return []

@@ -10294,6 +9952,9 @@ class LFM2Model(TextModel):
    def _is_vision_tensor(self, name: str) -> bool:
        return "vision_tower" in name or "multi_modal_projector" in name

+    def _is_audio_tensor(self, name: str):
+        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+

@ModelBase.register("Lfm2Model")
 class LFM2ColBertModel(LFM2Model):
@@ -10421,11 +10082,13 @@ class LFM2VLModel(MmprojModel):


@ModelBase.register("Lfm2AudioForConditionalGeneration")
-class LFM2AudioModel(ConformerAudioModel):
+class LFM2AudioModel(MmprojModel):
    has_vision_encoder = False
    has_audio_encoder = True
    model_name = "Lfm2AudioEncoder"

+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
    def get_audio_config(self) -> dict[str, Any] | None:
        return self.global_config.get("encoder")

@@ -10439,7 +10102,12 @@ class LFM2AudioModel(ConformerAudioModel):
        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)

-    def modify_tensors(self, data_torch, name, bid):
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # skip language model tensors
        if name.startswith("lfm."):
            return []
@@ -10452,7 +10120,40 @@ class LFM2AudioModel(ConformerAudioModel):
        if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
            return []

-        return super().modify_tensors(data_torch, name, bid)
+        # fold running_mean, running_var and eps into weight and bias for batch_norm
+        if "batch_norm" in name:
+            if self._batch_norm_tensors is None:
+                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+            assert bid is not None
+            self._batch_norm_tensors[bid][name] = data_torch
+
+            if len(self._batch_norm_tensors[bid]) < 5:
+                return []
+
+            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
+            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
+            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
+            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
+            eps = 1e-5 # default value
+
+            a = weight / torch.sqrt(running_var + eps)
+            b = bias - running_mean * a
+            return [
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
+            ]
+
+        # reshape conv weights
+        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+            data_torch = data_torch[:, None, None]
+        if "conv.depthwise_conv" in name and name.endswith(".weight"):
+            assert data_torch.shape[1] == 1
+            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
+        if "conv.pointwise_conv" in name and name.endswith(".weight"):
+            assert data_torch.shape[2] == 1
+            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
+
+        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("SmallThinkerForCausalLM")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -147,7 +147,6 @@ models = [
    {"name": "kormo",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
    {"name": "youtu",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
    {"name": "solar-open",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
-    {"name": "exaone-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/docs/backend/hexagon/CMakeUserPresets.json
+++ b/docs/backend/hexagon/CMakeUserPresets.json
@@ -1,4 +1,4 @@
-{
+{
  "version": 4,
  "configurePresets": [
    {
@@ -23,7 +23,7 @@
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
-            "LLAMA_OPENSSL":    "OFF"
+            "LLAMA_CURL":       "OFF"
        }
    },

@@ -38,7 +38,7 @@
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
-            "LLAMA_OPENSSL":    "OFF"
+            "LLAMA_CURL":       "OFF"
        }
    },

--- a/docs/backend/hexagon/README.md
+++ b/docs/backend/hexagon/README.md
@@ -210,10 +210,6 @@ build: 6a8cf8914 (6733)
  Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
  This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).

- `GGML_HEXAGON_EXPERIMENTAL=1`
-  Controls whether the Hexagon backend enables experimental features.
-  This option is required for enabling/testing experimental Ops (FLASH_ATTN_EXT).
-
 - `GGML_HEXAGON_VERBOSE=1`
  Enables verbose logging of Ops from the backend. Example output:

--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -15,7 +15,7 @@ Below is the build script: it requires utilizing RISC-V vector instructions for
 cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CPU_RISCV64_SPACEMIT=ON \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -DGGML_RVV=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
--- a/docs/build.md
+++ b/docs/build.md
@@ -65,10 +65,10 @@ cmake --build build --config Release
      cmake --preset x64-windows-llvm-release
      cmake --build build-x64-windows-llvm-release
      ```
- If you want HTTPS/TLS features, you may install OpenSSL development libraries. If not installed, the project will build and run without SSL support.
-  - **Debian / Ubuntu:** `sudo apt-get install libssl-dev`
-  - **Fedora / RHEL / Rocky / Alma:** `sudo dnf install openssl-devel`
-  - **Arch / Manjaro:** `sudo pacman -S openssl`
+- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
+  - **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev`  # (or `libcurl4-gnutls-dev` if you prefer GnuTLS)
+  - **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel`
+  - **Arch / Manjaro:** `sudo pacman -S curl`  # includes libcurl headers

 ## BLAS Build

--- a/docs/ops.md
+++ b/docs/ops.md
@@ -57,6 +57,7 @@ Legend:
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -70,9 +71,10 @@ Legend:
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
+|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
 |                              PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -97,6 +99,7 @@ Legend:
 |                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
--- a/docs/ops/BLAS.csv
+++ b/docs/ops/BLAS.csv
@@ -965,7 +965,6 @@
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
-"BLAS","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
@@ -4965,7 +4964,6 @@
 "BLAS","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","0","no","BLAS"
 "BLAS","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","BLAS"
 "BLAS","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","BLAS"
-"BLAS","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","BLAS"
 "BLAS","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","BLAS"
 "BLAS","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","BLAS"
 "BLAS","ARGMAX","type=f32,ne=[32,1,1,1]","support","0","no","BLAS"
@@ -5717,15 +5715,15 @@
 "BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
 "BLAS","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","0","no","BLAS"
 "BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
@@ -5735,15 +5733,6 @@
 "BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
@@ -6603,30 +6592,6 @@
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=67,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f32,type_b=f32,m=64,n=77,k=77,bs=[12,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
 "BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=576,n=512,k=576,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
-"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=f32,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q4_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q5_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q5_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q8_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=mxfp4,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q2_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q3_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q4_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q5_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=q6_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq2_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq2_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq3_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq1_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq1_m,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq4_nl,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq3_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
-"BLAS","MUL_MAT","type_a=iq4_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=128,n=1,k=1056,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=2112,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
@@ -8951,11 +8916,6 @@
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
-"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
-"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
-"BLAS","SOFT_MAX","type=f32,ne=[200000,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
-"BLAS","SOFT_MAX","type=f32,ne=[200000,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
-"BLAS","SOFT_MAX","type=f32,ne=[643251,3,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
@@ -9008,7 +8968,6 @@
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9018,7 +8977,6 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9029,13 +8987,11 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9045,7 +9001,6 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9056,13 +9011,11 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9072,7 +9025,6 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9083,13 +9035,11 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9099,7 +9049,6 @@
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9110,7 +9059,6 @@
 "BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9236,7 +9184,6 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9246,7 +9193,6 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9257,13 +9203,11 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9273,7 +9217,6 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9284,13 +9227,11 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9300,7 +9241,6 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9311,13 +9251,11 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9327,7 +9265,6 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9338,7 +9275,6 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
-"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9606,333 +9542,333 @@
 "BLAS","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","BLAS"
 "BLAS","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","BLAS"
 "BLAS","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","BLAS"
-"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","0","no","BLAS"
+"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest,flags=none","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","BLAS"
 "BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
-"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
+"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
 "BLAS","SUM","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
 "BLAS","SUM_ROWS","type=f32,ne=[10,5,4,3],permute=0,slice=0","support","0","no","BLAS"
 "BLAS","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","BLAS"
@@ -9955,9 +9891,8 @@
 "BLAS","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
 "BLAS","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
 "BLAS","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","BLAS"
 "BLAS","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","BLAS"
 "BLAS","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","BLAS"
 "BLAS","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","BLAS"
@@ -9979,7 +9914,6 @@
 "BLAS","CUMSUM","type=f32,ne=[2048,5,4,3]","support","0","no","BLAS"
 "BLAS","CUMSUM","type=f32,ne=[242004,1,1,1]","support","0","no","BLAS"
 "BLAS","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","BLAS"
-"BLAS","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","BLAS"
 "BLAS","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
 "BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","BLAS"
 "BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","BLAS"
@@ -9989,41 +9923,17 @@
 "BLAS","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","BLAS"
 "BLAS","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","BLAS"
 "BLAS","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","0","no","BLAS"
-"BLAS","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","BLAS"
-"BLAS","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","BLAS"
-"BLAS","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","BLAS"
 "BLAS","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","BLAS"
-"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","BLAS"
-"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","BLAS"
+"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","BLAS"
 "BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","0","no","BLAS"
 "BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","BLAS"
 "BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","BLAS"
--- a/docs/ops/zDNN.csv
+++ b/docs/ops/zDNN.csv
--- a/docs/preset.md
+++ b/docs/preset.md
@@ -58,40 +58,3 @@ temp = 0.8
 ctx-size = 1024
 ; (and other configurations)
 ```
-
-### Named presets
-
-If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo containing a single `preset.ini` file that references the actual model(s):
-
-```ini
-[*]
-mmap = 1
-
-[gpt-oss-20b-hf]
-hf          = ggml-org/gpt-oss-20b-GGUF
-batch-size  = 2048
-ubatch-size = 2048
-top-p       = 1.0
-top-k       = 0
-min-p       = 0.01
-temp        = 1.0
-chat-template-kwargs = {"reasoning_effort": "high"}
-
-[gpt-oss-120b-hf]
-hf          = ggml-org/gpt-oss-120b-GGUF
-batch-size  = 2048
-ubatch-size = 2048
-top-p       = 1.0
-top-k       = 0
-min-p       = 0.01
-temp        = 1.0
-chat-template-kwargs = {"reasoning_effort": "high"}
-```
-
-You can then use it via `llama-cli` or `llama-server`, example:
-
-```sh
-llama-server -hf user/repo:gpt-oss-120b-hf
-```
-
-Please make sure to provide the correct `hf-repo` for each child preset. Otherwise, you may get error: `The specified tag is not a valid quantization scheme.`
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -21,7 +21,7 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

--- a/examples/debug/debug.cpp
+++ b/examples/debug/debug.cpp
@@ -1,9 +1,11 @@
-#include "debug.h"
 #include "arg.h"
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "ggml.h"

+#include <cmath>
+#include <cstdint>
 #include <cstdlib>
 #include <string>
 #include <vector>
@@ -11,7 +13,7 @@
 #include <fstream>
 #include <regex>

-static void print_usage(int /*argc*/, char ** argv) {
+static void print_usage(int, char ** argv) {
    const std::string usage_template = R"(
        example usage:

@@ -33,21 +35,33 @@ static void print_usage(int /*argc*/, char ** argv) {
    LOG("%s\n", usage.c_str());
 }

-static bool has_pooling(llama_context * ctx) {
-    switch (llama_pooling_type(ctx)) {
-        case LLAMA_POOLING_TYPE_NONE:
-        case LLAMA_POOLING_TYPE_UNSPECIFIED:
-            return false;
-        default:
-            return true;
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data);
+
+struct callback_data {
+    std::vector<uint8_t>    data;
+    std::vector<std::regex> tensor_filters;
+
+    callback_data() = default;
+
+    callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
+        for (const auto & pattern : filter_patterns) {
+            try {
+                std::string anchored_pattern = "^" + pattern;
+                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
+            } catch (const std::regex_error & e) {
+                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
+            }
+        }
+        params.cb_eval           = ggml_debug;
+        params.cb_eval_user_data = this;
    }
-}
+};

 struct output_data {
    float *                  data_ptr    = nullptr;
    int                      data_size   = 0;
    std::string              type_suffix;
-    std::vector<float>       embd_norm;
+    std::vector<float>       storage;
    std::string              prompt;
    std::vector<llama_token> tokens;

@@ -59,32 +73,24 @@ struct output_data {
        prompt = params.prompt;

        if (params.embedding) {
-            const int n_embd       = llama_model_n_embd_out(model);
-            const bool pooling     = has_pooling(ctx);
-            const int n_embd_count = pooling ? 1 : tokens.size();
-            const int n_floats     = n_embd * n_embd_count;
+            const int  n_embd          = llama_model_n_embd_out(model);
+            const bool pooling_enabled = llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE;
+            const int  n_embd_count    = pooling_enabled ? 1 : tokens.size();
+            const int  n_embeddings    = n_embd * n_embd_count;

-            float * embd_raw = pooling ? llama_get_embeddings_seq(ctx, 0) : llama_get_embeddings(ctx);
-            if (embd_raw == nullptr) {
-                throw std::runtime_error("failed to get embeddings from the model");
+            float * embeddings;
+            if (pooling_enabled) {
+                embeddings = llama_get_embeddings_seq(ctx, 0);
+                storage.resize(n_embeddings);
+                common_embd_normalize(embeddings, storage.data(), n_embeddings, params.embd_normalize);
+                embeddings = storage.data();
+            } else {
+                embeddings = llama_get_embeddings(ctx);
            }

-            LOG_DBG("pooling_enabled: %s\n", pooling ? "true" : "false");
-            LOG_DBG("n_embd: %d\n", n_embd);
-            LOG_DBG("n_floats: %d\n", n_floats);
-            LOG_DBG("n_embd_count: %d\n", n_embd_count);
-
-            data_ptr    = embd_raw;
-            data_size   = n_floats;
+            data_ptr = embeddings;
+            data_size = n_embeddings;
            type_suffix = "-embeddings";
-
-            if (params.embd_normalize >= 0) {
-                embd_norm.resize(n_floats);
-                for (int i = 0; i < n_embd_count; i++) {
-                    common_embd_normalize(embd_raw+i*n_embd, embd_norm.data()+i*n_embd, n_embd, params.embd_normalize);
-                }
-                data_ptr = embd_norm.data();
-            }
        } else {
            const float * logits = llama_get_logits_ith(ctx, tokens.size() - 1);
            const int n_logits = llama_vocab_n_tokens(vocab);
@@ -96,6 +102,168 @@ struct output_data {
    }
 };

+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+static float ggml_get_float_value(const uint8_t * data, ggml_type type,
+        const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+    switch (type) {
+        case GGML_TYPE_F16:
+            return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
+        case GGML_TYPE_F32:
+            return *(const float *) &data[i];
+        case GGML_TYPE_I64:
+            return (float) *(const int64_t *) &data[i];
+        case GGML_TYPE_I32:
+            return (float) *(const int32_t *) &data[i];
+        case GGML_TYPE_I16:
+            return (float) *(const int16_t *) &data[i];
+        case GGML_TYPE_I8:
+            return (float) *(const int8_t *) &data[i];
+        case GGML_TYPE_BF16:
+            return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    GGML_ASSERT(n > 0);
+    float sum    = 0;
+    float sum_sq = 0.0;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    sum    += v;
+                    sum_sq += v * v;
+                }
+            }
+        }
+    }
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        LOG_DBG("                                     [\n");
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) {
+                LOG_DBG("                                      ..., \n");
+                i2 = ne[2] - n;
+            }
+            LOG_DBG("                                      [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) {
+                    LOG_DBG("                                       ..., \n");
+                    i1 = ne[1] - n;
+                }
+                LOG_DBG("                                       [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) {
+                        LOG_DBG("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    LOG_DBG("%12.4f", v);
+                    if (i0 < ne[0] - 1) {
+                        LOG_DBG(", ");
+                    }
+                }
+                LOG_DBG("],\n");
+            }
+            LOG_DBG("                                      ],\n");
+        }
+        LOG_DBG("                                     ]\n");
+        LOG_DBG("                                     sum    = %f\n", sum);
+        LOG_DBG("                                     sum_sq = %f\n", sum_sq);
+    }
+
+    if (std::isnan(sum)) {
+        LOG_ERR("encountered NaN - aborting\n");
+        exit(0);
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    bool matches_filter = cb_data->tensor_filters.empty();
+
+    if (!matches_filter) {
+        for (const auto & filter : cb_data->tensor_filters) {
+            if (std::regex_search(t->name, filter)) {
+                matches_filter = true;
+                break;
+            }
+        }
+    }
+
+    char src1_str[128] = {0};
+    if (src1) {
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+    }
+
+    if (matches_filter) {
+        LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+             t->name,
+             ggml_type_name(t->type),
+             ggml_op_desc(t),
+             src0->name,
+             ggml_ne_string(src0).c_str(),
+             src1 ? src1_str : "",
+             ggml_ne_string(t).c_str());
+    }
+
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type) && matches_filter) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
+
 static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
    std::filesystem::create_directory(output_dir);
    auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
@@ -222,7 +390,7 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    base_callback_data cb_data(params, params.tensor_filter);
+    callback_data cb_data(params, params.tensor_filter);

    auto llama_init = common_init_from_params(params);

--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -4,23 +4,12 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

-if(LLAMA_BUILD_TESTS)
-    if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
-        set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf")
-        set(MODEL_HASH "SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739")
-    else()
-        set(MODEL_NAME "tinyllamas/stories15M-be.Q4_0.gguf")
-        set(MODEL_HASH "SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d")
-    endif()
-    set(MODEL_DEST "${CMAKE_BINARY_DIR}/${MODEL_NAME}")
-    set(TEST_TARGET test-eval-callback)
-    add_test(NAME ${TEST_TARGET}-download-model COMMAND ${CMAKE_COMMAND}
-        -DDEST=${MODEL_DEST}
-        -DNAME=${MODEL_NAME}
-        -DHASH=${MODEL_HASH}
-        -P ${CMAKE_SOURCE_DIR}/cmake/download-models.cmake
-    )
-    set_tests_properties(${TEST_TARGET}-download-model PROPERTIES FIXTURES_SETUP ${TEST_TARGET}-download-model)
-    add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback -m "${MODEL_DEST}" --prompt hello --seed 42 -ngl 0)
-    set_tests_properties(${TEST_TARGET} PROPERTIES FIXTURES_REQUIRED ${TEST_TARGET}-download-model)
+set(TEST_TARGET test-eval-callback)
+if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+        add_test(NAME ${TEST_TARGET}
+                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+else()
+        add_test(NAME ${TEST_TARGET}
+                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
 endif()
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,12 +1,165 @@
 #include "arg.h"
 #include "common.h"
-#include "debug.h"
 #include "log.h"
 #include "llama.h"
-#include "llama-cpp.h"
+#include "ggml.h"
+
+#include <cmath>
+#include <cstdio>
 #include <string>
 #include <vector>

+/**
+ * This the arbitrary data which will be passed to each callback.
+ * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+    std::vector<uint8_t> data;
+};
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+    float v;
+    if (type == GGML_TYPE_F16) {
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
+    } else if (type == GGML_TYPE_F32) {
+        v = *(const float *) &data[i];
+    } else if (type == GGML_TYPE_I64) {
+        v = (float) *(const int64_t *) &data[i];
+    } else if (type == GGML_TYPE_I32) {
+        v = (float) *(const int32_t *) &data[i];
+    } else if (type == GGML_TYPE_I16) {
+        v = (float) *(const int16_t *) &data[i];
+    } else if (type == GGML_TYPE_I8) {
+        v = (float) *(const int8_t *) &data[i];
+    } else if (type == GGML_TYPE_BF16) {
+        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+    return v;
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    GGML_ASSERT(n > 0);
+    float sum = 0;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    sum += v;
+                }
+            }
+        }
+    }
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        LOG("                                     [\n");
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) {
+                LOG("                                      ..., \n");
+                i2 = ne[2] - n;
+            }
+            LOG("                                      [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) {
+                    LOG("                                       ..., \n");
+                    i1 = ne[1] - n;
+                }
+                LOG("                                       [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) {
+                        LOG("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    LOG("%12.4f", v);
+                    if (i0 < ne[0] - 1) LOG(", ");
+                }
+                LOG("],\n");
+            }
+            LOG("                                      ],\n");
+        }
+        LOG("                                     ]\n");
+        LOG("                                     sum = %f\n", sum);
+    }
+
+    // TODO: make this abort configurable/optional?
+    if (std::isnan(sum)) {
+        LOG_ERR("encountered NaN - aborting\n");
+        exit(0);
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    char src1_str[128] = {0};
+    if (src1) {
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+    }
+
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+         t->name, ggml_type_name(t->type), ggml_op_desc(t),
+         src0->name, ggml_ne_string(src0).c_str(),
+         src1 ? src1_str : "",
+         ggml_ne_string(t).c_str());
+
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type)) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
 static bool run(llama_context * ctx, const common_params & params) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -29,7 +182,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 }

 int main(int argc, char ** argv) {
-    base_callback_data cb_data;
+    callback_data cb_data;

    common_params params;

@@ -44,7 +197,7 @@ int main(int argc, char ** argv) {

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
-    params.cb_eval = common_debug_cb_eval<false>;
+    params.cb_eval = ggml_debug;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

--- a/examples/llama.android/lib/build.gradle.kts
+++ b/examples/llama.android/lib/build.gradle.kts
@@ -26,7 +26,7 @@ android {

                arguments += "-DBUILD_SHARED_LIBS=ON"
                arguments += "-DLLAMA_BUILD_COMMON=ON"
-                arguments += "-DLLAMA_OPENSSL=OFF"
+                arguments += "-DLLAMA_CURL=OFF"

                arguments += "-DGGML_NATIVE=OFF"
                arguments += "-DGGML_BACKEND_DL=ON"
--- a/examples/model-conversion/scripts/causal/modelcard.template
+++ b/examples/model-conversion/scripts/causal/modelcard.template
@@ -7,7 +7,7 @@ base_model:
 Recommended way to run this model:

 ```sh
-llama-server -hf {namespace}/{model_name}-GGUF
+llama-server -hf {namespace}/{model_name}-GGUF -c 0
 ```

 Then, access http://localhost:8080
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh

 #for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference

 #for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF

 #build example/main
 #cmake --build . --config Release --target main
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR

 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON

 ::  for FP32
-cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR

 ::  build all binary
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -32,12 +32,14 @@ if (BLAS_FOUND)
                pkg_check_modules(DepBLAS openblas)
            endif()
        elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
+            add_compile_definitions(GGML_BLAS_USE_BLIS)
            pkg_check_modules(DepBLAS blis)
        elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
            pkg_check_modules(DepBLAS blas-atlas)
        elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
            pkg_check_modules(DepBLAS flexiblas_api)
        elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+            add_compile_definitions(GGML_BLAS_USE_MKL)
            # all Intel* libraries share the same include path
            pkg_check_modules(DepBLAS mkl-sdl)
        elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
@@ -72,26 +74,10 @@ if (BLAS_FOUND)

    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

-    if ("${GGML_BLAS_VENDOR}" STREQUAL "")
-        message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
-    endif()
-
-    if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
+    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
        add_compile_definitions(GGML_BLAS_USE_MKL)
    endif()

-    if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
-        add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
-    endif()
-
-    if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
-        add_compile_definitions(GGML_BLAS_USE_BLIS)
-    endif()
-
-    if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
-        add_compile_definitions(GGML_BLAS_USE_NVPL)
-    endif()
-
    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -115,11 +115,15 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
    }

-#if defined(GGML_BLAS_USE_OPENBLAS)
+#if defined(OPENBLAS_VERSION)
    openblas_set_num_threads(ctx->n_threads);
-#elif defined(GGML_BLAS_USE_BLIS)
+#endif
+
+#if defined(GGML_BLAS_USE_BLIS)
    bli_thread_set_num_threads(ctx->n_threads);
-#elif defined(GGML_BLAS_USE_NVPL)
+#endif
+
+#if defined(GGML_BLAS_USE_NVPL)
    nvpl_blas_set_num_threads(ctx->n_threads);
 #endif

@@ -284,7 +288,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
        /* .context = */ ctx,
    };

-#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
    if (openblas_get_parallel() != OPENBLAS_OPENMP) {
        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
    }
@@ -325,7 +329,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
        return "BLIS";
    #elif defined(GGML_BLAS_USE_NVPL)
        return "NVPL";
-    #elif defined(GGML_BLAS_USE_OPENBLAS)
+    #elif defined(OPENBLAS_VERSION)
        return "OpenBLAS";
    #else
        return "BLAS";
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -262,10 +262,6 @@ static const char * cu_get_error_str(CUresult err) {
 #define FLASH_ATTN_AVAILABLE
 #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)

-#if defined(TURING_MMA_AVAILABLE)
-#define LDMATRIX_TRANS_AVAILABLE
-#endif // defined(TURING_MMA_AVAILABLE)
-
 static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
@@ -530,86 +526,6 @@ static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }

-enum class block_reduce_method {
-    MAX,
-    SUM,
-};
-
-template<block_reduce_method method_t, typename T>
-struct block_reduce_policy;
-
-template <typename T, typename... Ts>
-inline constexpr bool is_any = (std::is_same_v<T, Ts> || ...);
-
-template<typename...>
-inline constexpr bool ggml_cuda_dependent_false_v = false;
-
-template <typename T> struct block_reduce_policy<block_reduce_method::SUM, T> {
-    static __device__ T reduce(T val) {
-        if constexpr(is_any<T, float, float2, half2, int>) {
-            return warp_reduce_sum(val);
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
-        }
-    }
-
-    static __device__ T sentinel() {
-        if constexpr (std::is_same_v<T, float>) {
-            return 0.0f;
-        } else if constexpr (std::is_same_v<T, float2>) {
-            return make_float2(0.0f, 0.0f);
-        } else if constexpr (std::is_same_v<T, half2>) {
-            return make_half2(0.0f, 0.0f);
-        } else if constexpr (std::is_same_v<T, int>) {
-            return 0;
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
-        }
-    }
-};
-
-template <typename T> struct block_reduce_policy<block_reduce_method::MAX, T> {
-    static __device__ T reduce(T val) {
-        if constexpr (is_any<T, float, half2>) {
-            return warp_reduce_max(val);
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
-        }
-    }
-
-    static __device__ T sentinel() {
-        if constexpr (std::is_same_v<T, float>) {
-            return -INFINITY;
-        } else if constexpr (std::is_same_v<T, half2>) {
-            return make_half2(-INFINITY, -INFINITY);
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
-        }
-    }
-};
-
-template <block_reduce_method reduce_method_t, const unsigned int block_size_template = 0, typename T>
-static __device__ T block_reduce(T val, T * shared_vals) {
-    val                           = block_reduce_policy<reduce_method_t, T>::reduce(val);
-    const unsigned int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-    if (block_size > WARP_SIZE) {
-        assert((block_size <= 1024) && (block_size % WARP_SIZE) == 0);
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            shared_vals[warp_id] = val;
-        }
-        __syncthreads();
-        val = block_reduce_policy<reduce_method_t, T>::sentinel();
-        if (lane_id < (static_cast<int>(block_size) / WARP_SIZE)) {
-            val = shared_vals[lane_id];
-        }
-        return block_reduce_policy<reduce_method_t, T>::reduce(val);
-    }
-
-    return val;
-}
-
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE

--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -914,7 +914,7 @@ void launch_fattn(

        const int nblocks_stream_k = max_blocks;

-        const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
+        const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || tiles_efficiency_percent < 75;

        blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
        blocks_num.y = 1;
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -98,19 +98,6 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
 }

-static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_rdna(const int DKQ, const int DV, const int ncols) {
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 128, 2,  64, 128, 128, 128, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 2, true);
-
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32,  96,  64, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128, 128, 1, false);
-
-    // TODO tune specifically for RDNA
-    return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
-}
-
 static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
    if (ampere_mma_available(cc)) {
        return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
@@ -118,9 +105,6 @@ static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, c
    if (turing_mma_available(cc)) {
        return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
    }
-    if (amd_wmma_available(cc)) {
-        return ggml_cuda_fattn_mma_get_config_rdna(DKQ, DV, ncols);
-    }
    GGML_ASSERT(volta_mma_available(cc));
    return ggml_cuda_fattn_mma_get_config_volta(DKQ, DV, ncols);
 }
@@ -132,8 +116,6 @@ static constexpr __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config(cons
    return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
 #elif defined(VOLTA_MMA_AVAILABLE)
    return ggml_cuda_fattn_mma_get_config_volta(DKQ, DV, ncols);
-#elif defined(AMD_WMMA_AVAILABLE)
-    return ggml_cuda_fattn_mma_get_config_rdna(DKQ, DV, ncols);
 #else
    GGML_UNUSED_VARS(DKQ, DV, ncols);
    return fattn_mma_config(32, 1, 0, 0, 0, 0, 0, false);
@@ -204,23 +186,6 @@ static constexpr __device__ bool ggml_cuda_fattn_mma_get_Q_in_reg(const int DKQ,
    return ggml_cuda_fattn_mma_get_config(DKQ, DV, ncols).Q_in_reg;
 }

-static constexpr __device__ int get_cols_per_thread() {
-#if defined(AMD_WMMA_AVAILABLE)
-    return 1; // RDNA has a single column.
-#else
-    return 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-#endif // defined(AMD_WMMA_AVAILABLE)
-}
-
-static __host__ int get_cols_per_warp(const int cc) {
-    if (turing_mma_available(cc) || amd_wmma_available(cc)) {
-        return 16;
-    } else {
-        // Volta
-        return 32;
-    }
-}
-
 // ------------------------------------------------------------------------------------------------------------------

 static __host__ int ggml_cuda_fattn_mma_get_nstages(const int DKQ, const int DV, const int ncols1, const int ncols2, const int cc) {
@@ -428,10 +393,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        const int jt,
        const int kb0,
        const int k_VKQ_sup) {
-#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
    constexpr int  ncols           = ncols1 * ncols2;
    constexpr int  cols_per_warp   = T_B_KQ::I;
-    constexpr int  cols_per_thread = get_cols_per_thread();
+    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
@@ -448,8 +413,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    const int k_VKQ_0 = kb0 * nbatch_fa;
 #if defined(TURING_MMA_AVAILABLE)
    T_C_KQ KQ_C[nbatch_fa/(np*(cols_per_warp == 8 ? T_C_KQ::I : T_C_KQ::J))];
-#elif defined(AMD_WMMA_AVAILABLE)
-    T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
 #else // Volta
    T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
 #endif // defined(TURING_MMA_AVAILABLE)
@@ -498,14 +461,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    if constexpr (cols_per_warp == 8) {
                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
                    } else {
-                        // Wide version of KQ_C is column-major
-#if defined(AMD_WMMA_AVAILABLE)
-                        // RDNA matrix C is column-major.
-                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
-#else
-                        // swap A and B for CUDA.
+                        // Wide version of KQ_C is column-major => swap A and B.
                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[k_KQ_0/T_A_KQ::J], K_A);
-#endif // defined(AMD_WMMA_AVAILABLE)
                    }
                }
            }
@@ -522,14 +479,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    T_A_KQ K_A;
                    load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);

-                    // Wide version of KQ_C is column-major
-#if defined(AMD_WMMA_AVAILABLE)
-                    // RDNA matrix C is column-major.
-                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
-#else
-                    // swap A and B for CUDA.
+                    // Wide version of KQ_C is column-major => swap A and B.
                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
-#endif // defined(AMD_WMMA_AVAILABLE)
                }
            }
        }
@@ -581,13 +532,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
-                    // Turing + Volta:
-                    const int KQ_idx = l % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
+                    KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
                }
            }
        }
@@ -607,14 +552,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
-                    // Turing + Volta:
-                    const int KQ_idx = l % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_C[k0/(np*T_C_KQ::I)].x[l] = expf(KQ_C[k0/(np*T_C_KQ::I)].x[l] - KQ_max_new[KQ_idx]);
-                    KQ_rowsum_add[KQ_idx] += KQ_C[k0/(np*T_C_KQ::I)].x[l];
+                    KQ_C[k0/(np*T_C_KQ::I)].x[l] = expf(KQ_C[k0/(np*T_C_KQ::I)].x[l] - KQ_max_new[l % 2]);
+                    KQ_rowsum_add[l % 2] += KQ_C[k0/(np*T_C_KQ::I)].x[l];
                } else {
                    KQ_C[k0/(np*T_C_KQ::I)].x[l] = 0.0f;
                }
@@ -645,13 +584,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
                    // Turing + Volta:
-                    const int KQ_idx = (l/2) % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
+                    KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
                }
            }
        }
@@ -662,11 +596,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
            // Values per KQ column are spread across 4 threads:
            constexpr int offset_first = 2;
            constexpr int offset_last  = 1;
-#elif defined(AMD_WMMA_AVAILABLE)
-            // Values per KQ column are spread across 2 threads:
-            constexpr int offset_first = 16;
-            constexpr int offset_last  = 16;
-#else // Volta
+#else
            // Values per KQ column are spread across 2 threads:
            constexpr int offset_first = 2;
            constexpr int offset_last  = 2;
@@ -682,15 +612,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
+                // Turing + Volta:
                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
-                    // Turing + Volta:
-                    const int KQ_idx = (l/2) % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_C[(k0/(np*T_C_KQ::J))].x[l] = expf(KQ_C[(k0/(np*T_C_KQ::J))].x[l] - KQ_max_new[KQ_idx]);
-                    KQ_rowsum_add[KQ_idx] += KQ_C[(k0/(np*T_C_KQ::J))].x[l];
+                    KQ_C[(k0/(np*T_C_KQ::J))].x[l] = expf(KQ_C[(k0/(np*T_C_KQ::J))].x[l] - KQ_max_new[(l/2) % 2]);
+                    KQ_rowsum_add[(l/2) % 2] += KQ_C[(k0/(np*T_C_KQ::J))].x[l];
                } else {
                    KQ_C[(k0/(np*T_C_KQ::J))].x[l] = 0.0f;
                }
@@ -714,7 +639,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(

 #if defined(TURING_MMA_AVAILABLE)
        if constexpr (cols_per_warp == 8) {
-            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[cols_per_thread - 1]);
+            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
 #pragma unroll
            for (int i = 0; i < DV/T_C_VKQ::I; ++i) {
 #pragma unroll
@@ -735,16 +660,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                }
            }
        }
-#elif defined(AMD_WMMA_AVAILABLE)
-        const half2 KQ_max_scale_h2 = make_half2(
-            KQ_max_scale[0], KQ_max_scale[0]);
-#pragma unroll
-        for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
-#pragma unroll
-            for (int l = 0; l < T_C_VKQ::ne; ++l) {
-                VKQ_C[i].x[l] *= KQ_max_scale_h2;
-            }
-        }
 #else // Volta
        const half2 KQ_max_scale_h2 = make_half2(
            KQ_max_scale[(threadIdx.x / 2) % 2], KQ_max_scale[(threadIdx.x / 2) % 2]);
@@ -792,10 +707,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    // Therefore, iterate over V in reverse and re-use the data if possible.
    static_assert(!mla || nstages <= 1, "combination of MLA and multi-stage loading not implemented");
    constexpr int reusable_cutoff = mla ? (DKQ - 1) - (DKQ - 1) % (2*nbatch_K2) - (DKQ - DV) : DV;
-#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
-    T_A_VKQ A_identity;
-    make_identity_mat(A_identity);
-#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)

    // Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
 #pragma unroll
@@ -816,7 +727,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        }
        const half2 * tile_V_i = i0_start < reusable_cutoff ? tile_V : tile_V + (i0_start - reusable_cutoff)/2;

-#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+#if defined(TURING_MMA_AVAILABLE)
        constexpr int i0_stride = cols_per_warp == 8 ? T_C_VKQ::I : 2*T_C_VKQ::J;
 #pragma unroll
        for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += i0_stride) {
@@ -826,26 +737,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;

                T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
-#if defined(LDMATRIX_TRANS_AVAILABLE)
                load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
-#else
-                // TODO: Try to transpose tile_V when loading gmem to smem.
-                // Use mma to transpose T_A_VKQ for RDNA.
-                T_A_VKQ A_trans;
-                load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
-                mma(A, A_trans, A_identity);
-#endif // defined(TURING_MMA_AVAILABLE)
                if constexpr (T_B_KQ::I == 8) {
                    mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
                } else {
-                    // Wide version of VKQ_C is column-major.
-#if defined(AMD_WMMA_AVAILABLE)
-                    // RDNA matrix C is column-major.
-                    mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
-#else
-                    // swap A and B for CUDA.
+                    // Wide version of VKQ_C is column-major => swap A and B.
                    mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::J)], A);
-#endif // defined(AMD_WMMA_AVAILABLE)
                }
            }
        }
@@ -864,7 +761,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::I)], A);
            }
        }
-#endif // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+#endif // defined(TURING_MMA_AVAILABLE)

        if constexpr (nstages <= 1) {
            __syncthreads(); // Only needed if tile_K == tile_V.
@@ -877,7 +774,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        tile_Q, tile_K, tile_V, tile_mask,
        Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
    NO_DEVICE_CODE;
-#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
 }

 #if defined(TURING_MMA_AVAILABLE)
@@ -897,15 +794,6 @@ template<> struct mma_tile_sizes<8> {
    using T_B_VKQ = tile< 8,  8, half2>; // column-major
    using T_C_VKQ = tile<16,  4, half2>; // row-major
 };
-#elif defined(AMD_WMMA_AVAILABLE)
-template<int ncols> struct mma_tile_sizes {
-    using T_A_KQ  = tile<16,  8, half2>; // row-major
-    using T_B_KQ  = tile<16,  8, half2>; // column-major
-    using T_C_KQ  = tile<16, 16, float>; // column-major
-    using T_A_VKQ = tile<16,  8, half2>; // row-major
-    using T_B_VKQ = tile<16,  8, half2>; // column-major
-    using T_C_VKQ = tile<16,  8, half2>; // column-major
-};
 #else // Volta
 template<int ncols> struct mma_tile_sizes {
    using T_A_KQ  = tile< 8,  4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
@@ -940,7 +828,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const int jt,
        const int kb0_start,
        const int kb0_stop) {
-#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    constexpr int ncols = ncols1 * ncols2;
@@ -952,7 +840,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    using     T_C_VKQ   = typename mma_tile_sizes<ncols>::T_C_VKQ;

    constexpr int  cols_per_warp   = T_B_KQ::I;
-    constexpr int  cols_per_thread = get_cols_per_thread();
+    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa     (DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2     (DKQ, DV, ncols);
@@ -983,8 +871,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    T_B_KQ    Q_B[(Q_in_reg ? DKQ/(2*T_B_KQ::J) : 1)];
 #if defined(TURING_MMA_AVAILABLE)
    T_C_VKQ VKQ_C[cols_per_warp == 8 ? DV/T_C_VKQ::I : DV/(2*T_C_VKQ::J)];
-#elif defined(AMD_WMMA_AVAILABLE)
-    T_C_VKQ VKQ_C[                                     DV/(2*T_C_VKQ::J)];
 #else // Volta
    T_C_VKQ VKQ_C[                                     DV/(2*T_C_VKQ::J)];
 #endif // defined(TURING_MMA_AVAILABLE)
@@ -1124,10 +1010,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        // The partial sums are spread across 8/4 threads.
        constexpr int offset_first = cols_per_warp == 8 ? 16 : 2;
        constexpr int offset_last  = cols_per_warp == 8 ?  4 : 1;
-#elif defined(AMD_WMMA_AVAILABLE)
-        // The partial sums are spread across 2 threads.
-        constexpr int offset_first = 16;
-        constexpr int offset_last  = 16;
 #else // Volta
        // The partial sums are spread across 2 threads.
        constexpr int offset_first = 2;
@@ -1165,7 +1047,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(

 #if defined(TURING_MMA_AVAILABLE)
        if constexpr (cols_per_warp == 8) {
-            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[cols_per_thread - 1]);
+            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
 #pragma unroll
            for (int i = 0; i < DV/T_C_VKQ::I; ++i) {
 #pragma unroll
@@ -1186,15 +1068,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
                }
            }
        }
-#elif defined(AMD_WMMA_AVAILABLE)
-        const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]);
-#pragma unroll
-        for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
-#pragma unroll
-            for (int l = 0; l < T_C_VKQ::ne; ++l) {
-                VKQ_C[i].x[l] *= KQ_max_scale_h2;
-            }
-        }
 #else // Volta
        const int col = (threadIdx.x / 2) % 2;
        const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]);
@@ -1246,10 +1119,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(threadIdx.x % 4);
        const float2 KQ_cmr = make_float2(KQ_max[threadIdx.x % cols_per_thread], KQ_rowsum[threadIdx.x % cols_per_thread]);
        const bool thread_should_write = threadIdx.x % 4 < cols_per_thread;
-#elif defined(AMD_WMMA_AVAILABLE)
-        const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(0);
-        const float2 KQ_cmr = make_float2(KQ_max[0], KQ_rowsum[0]);
-        const bool thread_should_write = threadIdx.x / 16 < cols_per_thread;
 #else // Volta
        const int jc_cwm = threadIdx.y*cols_per_warp + T_C_KQ::get_i(threadIdx.x & 2);
        const float2 KQ_cmr = make_float2(KQ_max[(threadIdx.x & 2) / 2], KQ_rowsum[(threadIdx.x & 2) / 2]);
@@ -1450,7 +1319,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        stride_Q1, stride_Q2, stride_K, stride_V, stride_mask,
        jt, kb0_start, kb0_stop);
    NO_DEVICE_CODE;
-#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
 }

 template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap, bool mla>
@@ -1477,7 +1346,7 @@ static __global__ void flash_attn_ext_f16(
                            const int32_t nb21, const int32_t nb22, const int64_t nb23,
                            const int32_t ne31, const int32_t ne32, const int32_t ne33,
                            const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
+#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))

    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) {
@@ -1491,13 +1360,6 @@ static __global__ void flash_attn_ext_f16(
    }
 #endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING

-#if defined(AMD_WMMA_AVAILABLE)
-    if (ncols1*ncols2 > 32 || ncols1*ncols2 < 16 || DKQ > 128 || ncols2 == 1) {
-        NO_DEVICE_CODE;
-        return;
-    }
-#endif // defined(AMD_WMMA_AVAILABLE)
-
    static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");

    constexpr int ncols     = ncols1 * ncols2;
@@ -1611,7 +1473,7 @@ static __global__ void flash_attn_ext_f16(
              ne31, ne32, ne33,
              nb31, nb32, nb33);
    NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
+#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))
 }

 template <int DKQ, int DV, int ncols1, int ncols2>
@@ -1630,7 +1492,7 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    const bool Q_in_reg       = ggml_cuda_fattn_mma_get_Q_in_reg      (DKQ, DV, ncols, cc);
    const int  nstages        = ggml_cuda_fattn_mma_get_nstages       (DKQ, DV, ncols1, ncols2, cc);

-    const int cols_per_warp = std::min(ncols, get_cols_per_warp(cc));
+    const int cols_per_warp = std::min(ncols, turing_mma_available(cc) ? 16 : 32);
    const int nwarps        = nthreads / WARP_SIZE;

    constexpr bool mla = DKQ == 576;
@@ -1650,34 +1512,29 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-#if defined(GGML_USE_HIP)
-    using fattn_kernel_ptr_t = const void*;
-#else
-    using fattn_kernel_ptr_t = fattn_kernel_t;
-#endif // defined(GGML_USE_HIP)
    fattn_kernel_t fattn_kernel;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, use_logit_softcap, mla>;

-#if !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
        if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(reinterpret_cast<fattn_kernel_ptr_t>(fattn_kernel), cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
+            CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
            shared_memory_limit_raised[id] = true;
        }
-#endif // !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    } else {
        constexpr bool use_logit_softcap = true;
        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, use_logit_softcap, mla>;

-#if !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
        if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(reinterpret_cast<fattn_kernel_ptr_t>(fattn_kernel), cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
+            CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
            shared_memory_limit_raised[id] = true;
        }
-#endif // !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    }

    launch_fattn<DV, ncols1, ncols2>
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
    return 128;
 }

-// Currenlty llvm with the amdgcn target does not support unrolling loops
+// Currenlty llvm with the amdgcn target dose not support unrolling loops
 // that contain a break that can not be resolved at compile time.
 #ifdef __clang__
 #pragma clang diagnostic push
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -18,12 +18,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
        }
    }

-    if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) {
+    if (turing_mma_available(cc) && Q->ne[1] <= 16/ncols2) {
        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
        return;
    }

-    if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || amd_wmma_available(cc) || Q->ne[1] <= 32/ncols2) {
+    if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || Q->ne[1] <= 32/ncols2) {
        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
        return;
    }
@@ -230,18 +230,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const

    // The effective batch size for the kernel can be increased by gqa_ratio.
    // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded,
-    bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
-    for (const ggml_tensor * t : {Q, K, V, mask}) {
-        if (t == nullptr) {
-            continue;
-        }
-        for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
-            if (t->nb[i] % 16 != 0) {
-                gqa_opt_applies = false;
-                break;
-            }
-        }
-    }
+    const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;

    const int cc = ggml_cuda_info().devices[device].cc;

@@ -348,31 +337,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
        return BEST_FATTN_KERNEL_WMMA_F16;
    }

-    if (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc) && gqa_opt_applies && Q->ne[0] <= 128 && Q->ne[0] != 40 && Q->ne[0] != 72) {
-        if (can_use_vector_kernel) {
-            if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
-                if (Q->ne[1] == 1) {
-                    if (!gqa_opt_applies) {
-                        return BEST_FATTN_KERNEL_VEC;
-                    }
-                }
-            } else {
-                if (Q->ne[1] <= 2) {
-                    return BEST_FATTN_KERNEL_VEC;
-                }
-            }
-        }
-        int gqa_ratio_eff = 1;
-        const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
-        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
-            gqa_ratio_eff *= 2;
-        }
-        if (Q->ne[1] * gqa_ratio_eff <= 8) {
-            return BEST_FATTN_KERNEL_TILE; // AMD WMMA is only faster if the full tile width of 16 can be utilized.
-        }
-        return BEST_FATTN_KERNEL_MMA_F16;
-    }
-
    // If there are no tensor cores available, use the generic tile kernel:
    if (can_use_vector_kernel) {
        if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3737,7 +3737,6 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {

    return cuda_ctx->cuda_graph->is_enabled();
 #else
-    GGML_UNUSED(cuda_ctx);
    return false;
 #endif // USE_CUDA_GRAPH
 }
@@ -4551,7 +4550,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_L2_NORM:
            return true;
        case GGML_OP_RMS_NORM_BACK:
-            return ggml_is_contiguous(op->src[0]);
+            return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
            break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -206,16 +206,10 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 16) {
+                // matrix C
 #if defined(RDNA3)
-                if constexpr (std::is_same_v<T, float> || std::is_same_v<T, int>) {
-                    // matrix C
-                    return 2 * l + (threadIdx.x / 16);
-                } else {
-                    // matrix A&B
-                    return l;
-                }
+                return 2 * l + (threadIdx.x / 16);
 #else
-                // matrix C is the transposed matrix A&B on RDNA4
                return ne * (threadIdx.x / 16) + l;
 #endif // defined(RDNA3)
            } else if constexpr (I == 16 && J == 8) {
@@ -627,21 +621,6 @@ namespace ggml_cuda_mma {

        return ret;
    }
-#elif defined(AMD_WMMA_AVAILABLE)
-    template <int I, int J>
-    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
-        tile<I, J/2, half2> ret;
-#pragma unroll
-        for (int l0 = 0; l0 < tile_float.ne; l0 += 2) {
-            ret.x[l0/2] = make_half2(tile_float.x[l0 + 0], tile_float.x[l0 + 1]);
-        }
-        return ret;
-    }
-
-    static __device__ __forceinline__ tile<8, 8, half2> get_transposed(const tile<16, 4, half2> & t) {
-        NO_DEVICE_CODE;
-        return tile<8, 8, half2>{};
-    }
 #else // Volta
    template <int I, int J>
    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -660,19 +639,6 @@ namespace ggml_cuda_mma {
    }
 #endif // defined(TURING_MMA_AVAILABLE)

-    static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
-#if defined(RDNA4)
-        const int row = t.get_i(0);
-        const int left_right = t.get_j(0) / 4;
-        const int up_down = row / 8;
-        const int idx = row % 8;
-        reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
-#else
-        GGML_UNUSED_VARS(t);
-        NO_DEVICE_CODE;
-#endif // defined(RDNA4)
-    }
-
    template <int I, int J, typename T, data_layout dl>
    static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
 #if defined(AMD_MFMA_AVAILABLE)
@@ -912,17 +878,6 @@ namespace ggml_cuda_mma {
            : "+r"(Dxi[2]), "+r"(Dxi[3])
            : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
-#elif defined(AMD_WMMA_AVAILABLE)
-#if defined(RDNA4)
-        using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
-        halfx8_t& acc_frag = reinterpret_cast<halfx8_t&>(D.x[0]);
-        const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
-        const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
-        acc_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
-#else
-        GGML_UNUSED_VARS(D, A, B);
-        NO_DEVICE_CODE;
-#endif // defined(RDNA4)
 #else
        GGML_UNUSED_VARS(D, A, B);
        NO_DEVICE_CODE;
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -190,7 +190,7 @@ void ggml_cuda_mul_mat_q(
    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
-        const int64_t s13 = src1->nb[3] / ts_src1;
+        const int64_t s13 = src1->nb[2] / ts_src1;

        if (use_native_mxfp4) {
            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@@ -333,31 +333,28 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
    }

    if (amd_wmma_available(cc)) {
+        // RDNA 4 is consistently worse on rocblas
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
-            // High expert counts are almost always better on MMQ due to
-            //     the synchronization overhead in the cuBLAS/hipBLAS path:
+            // High expert counts almost always better on MMQ
+            // due to a large amount of graph splits
            // https://github.com/ggml-org/llama.cpp/pull/18202
            if (n_experts >= 64) {
                return true;
            }

-            // For some quantization types MMQ can have lower peak TOPS than hipBLAS
-            //     so it's only faster for sufficiently small batch sizes:
            switch (type) {
+                // These quants are really bad on MMQ
                case GGML_TYPE_Q2_K:
-                    return ne11 <= 128;
                case GGML_TYPE_Q6_K:
-                    return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
+                // These quants are usually worse but not always
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ2_S:
-                    return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
+                    return ne11 <= 128;
                default:
                    return true;
            }
        }
-
-        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
-        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        return true;
    }

--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -25,8 +25,19 @@ static __global__ void norm_f32(
    }

    // sum up partial sums
-    extern __shared__ float2 s_sum2[];
-    mean_var = block_reduce<block_reduce_method::SUM, block_size>(mean_var, s_sum2);
+    mean_var = warp_reduce_sum(mean_var);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float2 s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
+    }

    const float mean = mean_var.x / ncols;
    const float var = mean_var.y / ncols - mean * mean;
@@ -50,8 +61,19 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
        tmp += x[j];
    }

-    extern __shared__ float s_sum[];
-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    const float mean = tmp / group_size;
    tmp = 0.0f;
@@ -62,7 +84,18 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
        tmp += xi * xi;
    }

-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    const float variance = tmp / group_size;
    const float scale = rsqrtf(variance + eps);
@@ -130,8 +163,22 @@ static __global__ void rms_norm_f32(const float * x,
    }

    // sum up partial sums
-    extern __shared__ float s_sum[];
-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int        warp_id = tid / WARP_SIZE;
+        const int        lane_id = tid % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = 0.0f;
+        if (lane_id < (block_size / WARP_SIZE)) {
+            tmp = s_sum[lane_id];
+        }
+        tmp = warp_reduce_sum(tmp);
+    }

    const float mean = tmp / ncols;
    const float scale = rsqrtf(mean + eps);
@@ -259,8 +306,19 @@ static __global__ void l2_norm_f32(
    }

    // sum up partial sums
-    extern __shared__ float s_sum[];
-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
    const float scale = rsqrtf(fmaxf(tmp, eps * eps));
@@ -279,7 +337,7 @@ static void norm_f32_cuda(
        norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float2): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    }
 }

@@ -290,7 +348,7 @@ static void group_norm_f32_cuda(
        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        group_norm_f32<1024><<<num_groups, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, group_size, ne_elements, eps);
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    }
 }

@@ -300,10 +358,10 @@ static void rms_norm_f32_cuda(
    const dim3 blocks_num(nrows, nchannels, nsamples);
    if (ncols < 1024) {
        const dim3 block_dims(256, 1, 1);
-        rms_norm_f32<256, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        rms_norm_f32<256, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    }
 }

@@ -346,12 +404,12 @@ static void rms_norm_mul_f32_cuda(const float *  x,
        const uint3 mul_nsamples_packed  = init_fastdiv_values(mul_nsamples);
        if (ncols < 1024) {
            const dim3 block_dims(256, 1, 1);
-            rms_norm_f32<256, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<256, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
        } else {
            const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
        }
@@ -367,14 +425,14 @@ static void rms_norm_mul_f32_cuda(const float *  x,
        const uint3 add_nsamples_packed  = init_fastdiv_values(add_nsamples);
        if (ncols < 1024) {
            const dim3 block_dims(256, 1, 1);
-            rms_norm_f32<256, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<256, true, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
                add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
                add_nchannels_packed, add_nsamples_packed);
        } else {
            const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
                add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
@@ -402,7 +460,7 @@ static void l2_norm_f32_cuda(
        l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        l2_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    }
 }

--- a/ggml/src/ggml-cuda/reduce_rows.cuh
+++ b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -28,8 +28,22 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
    }

    // sum up partial sums
-    __shared__ float shared_vals[32];
-    sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);
+    sum = warp_reduce_sum(sum);
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
+        __shared__ float s_sum[32];
+        const int        warp_id = threadIdx.x / WARP_SIZE;
+        const int        lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = sum;
+        }
+        __syncthreads();
+        sum = 0.0f;
+        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
+            sum = s_sum[lane_id];
+        }
+        sum = warp_reduce_sum(sum);
+    }

    if (col != 0) {
        return;
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -75,6 +75,9 @@ static __global__ void soft_max_f32(

    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
    const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1);

    extern __shared__ float data_soft_max_f32[];
@@ -99,7 +102,21 @@ static __global__ void soft_max_f32(
    }

    // find the max value in the block
-    max_val = block_reduce<block_reduce_method::MAX, block_size_template>(max_val, buf_iw);
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf_iw[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf_iw[lane_id];
+        max_val = warp_reduce_max(max_val);
+    }

    float tmp = 0.0f; // partial sum

@@ -117,7 +134,22 @@ static __global__ void soft_max_f32(
    }

    // find the sum of exps in the block
-    tmp = block_reduce<block_reduce_method::SUM, block_size_template>(tmp, buf_iw);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __syncthreads();
+        if (warp_id == 0) {
+            buf_iw[lane_id] = 0.0f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf_iw[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    if (sinks) {
        tmp += expf(sinks[i02] - max_val);
@@ -137,6 +169,50 @@ static __global__ void soft_max_f32(
    }
 }

+
+// TODO: This is a common pattern used across kernels that could be moved to common.cuh + templated
+static __device__ float two_stage_warp_reduce_max(float val) {
+    val = warp_reduce_max(val);
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
+        __shared__ float local_vals[32];
+        const int        warp_id = threadIdx.x / WARP_SIZE;
+        const int        lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            local_vals[warp_id] = val;
+        }
+        __syncthreads();
+        val = -INFINITY;
+        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
+            val = local_vals[lane_id];
+        }
+        return warp_reduce_max(val);
+    } else {
+        return val;
+    }
+}
+
+static __device__ float two_stage_warp_reduce_sum(float val) {
+    val = warp_reduce_sum(val);
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
+        __shared__ float local_vals[32];
+        const int        warp_id = threadIdx.x / WARP_SIZE;
+        const int        lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            local_vals[warp_id] = val;
+        }
+        __syncthreads();
+        val = 0.0f;
+        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
+            val = local_vals[lane_id];
+        }
+        return warp_reduce_sum(val);
+    } else {
+        return val;
+    }
+}
+
 // TODO: Template to allow keeping ncols in registers if they fit
 static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
                                                                float * __restrict__ dst,
@@ -154,7 +230,6 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
    float     local_vals[n_elem_per_thread] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
    float     local_max                     = -INFINITY;
    const int step_size                     = gridDim.x * blockDim.x;
-    __shared__ float shared_vals[32];

    // Compute thread-local max
    for (int col = col_start; col < p.ncols;) {
@@ -171,7 +246,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
    }

    // Compute CTA-level max
-    local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
+    local_max = two_stage_warp_reduce_max(local_max);

    // Store CTA-level max to GMEM
    if (tid == 0) {
@@ -186,7 +261,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
    } else {
        local_max = -INFINITY;
    }
-    local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
+    local_max = two_stage_warp_reduce_max(local_max);

    // Compute softmax dividends, accumulate divisor
    float tmp_expf = 0.0f;
@@ -209,7 +284,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
    }

    // Reduce divisor within CTA
-    tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
+    tmp_expf = two_stage_warp_reduce_sum(tmp_expf);

    // Store CTA-level sum to GMEM
    if (tid == 0) {
@@ -223,7 +298,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
    } else {
        tmp_expf = 0.0f;
    }
-    tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
+    tmp_expf = two_stage_warp_reduce_sum(tmp_expf);

    // Divide dividend by global sum + store data
    for (int col = col_start; col < p.ncols;) {
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -138,8 +138,6 @@
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
-#define cudaFuncSetAttribute hipFuncSetAttribute
-#define cudaFuncAttributeMaxDynamicSharedMemorySize hipFuncAttributeMaxDynamicSharedMemorySize
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -42,12 +42,12 @@
 #include "htp_iface.h"

 static size_t opt_ndev         = 1;
-static size_t opt_nhvx         = 0; // use all
-static int    opt_arch         = 0; // autodetect
+static size_t opt_nhvx         = 0;  // use all
+static int    opt_arch         = 0;  // autodetect
 static int    opt_etm          = 0;
 static int    opt_verbose      = 0;
 static int    opt_profile      = 0;
-static int    opt_hostbuf      = 1; // hostbuf ON by default
+static int    opt_hostbuf      = 1;
 static int    opt_experimental = 0;

 // Enable all stages by default
@@ -1753,9 +1753,6 @@ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b)
 }

 static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
-    if (!opt_hostbuf) {
-        return ggml_backend_buffer_is_hexagon(b);
-    }
    return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
 }

@@ -2305,16 +2302,6 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
    return n_bufs;
 }

-static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    req->op = HTP_OP_CPY;
-
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-
-    return n_bufs;
-}
-
 static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
    req->op = HTP_OP_GET_ROWS;

@@ -2570,10 +2557,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
                ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
                break;

-            case GGML_OP_CPY:
-                ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
-                break;
-
            default:
                GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
        }
@@ -2875,27 +2858,6 @@ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const str
    return true;
 }

-static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * dst  = op;
-
-    // for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
-    if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
-    if ( dst->type != GGML_TYPE_F32 &&  dst->type != GGML_TYPE_F16) return false;
-
-    const bool sametype   = (src0->type == dst->type);
-    const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
-    const bool sameshape  = !transposed && ggml_are_same_shape(src0, dst);
-
-    // can handle any shape and any same-type (pretty slow if reshaping is required)
-    if (sametype) return true;
-
-    // cannot handle re-shaping and type conversion at the same time
-    if (!sameshape) return false;
-
-    return true;
-}
-
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    auto sess = static_cast<ggml_hexagon_session *>(dev->context);

@@ -2974,10 +2936,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
            supp = ggml_hexagon_supported_get_rows(sess, op);
            break;

-        case GGML_OP_CPY:
-            supp = ggml_hexagon_supported_cpy(sess, op);
-            break;
-
        default:
            break;
    }
@@ -3103,7 +3061,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t
 }

 static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
-    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) {
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
        return (void *) fct;
    }
@@ -3120,31 +3078,34 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
    static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
                  "please update hexagon_type to match ggml_type");

-    const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
    const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
    const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
-    const char * str_opmask  = getenv("GGML_HEXAGON_OPMASK");
-    const char * str_opsync  = getenv("GGML_HEXAGON_OPSYNC");
-    const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
-    const char * str_etm     = getenv("GGML_HEXAGON_ETM");
-    const char * str_nhvx    = getenv("GGML_HEXAGON_NHVX");
-    const char * str_ndev    = getenv("GGML_HEXAGON_NDEV");
-    const char * str_arch    = getenv("GGML_HEXAGON_ARCH");

-    opt_experimental = str_experimental ? atoi(str_experimental) : 0;
    opt_verbose      = str_verbose ? atoi(str_verbose) : 0;
-    opt_hostbuf      = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
-    opt_opmask       = str_opmask  ? strtoul(str_opmask, NULL, 0) : opt_opmask;
-    opt_opsync       = str_opsync  ? atoi(str_opsync)  : 0;
-    opt_profile      = str_profile ? atoi(str_profile) : 0;
-    opt_etm          = str_etm     ? atoi(str_etm) : 0;
-    opt_nhvx         = str_nhvx    ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
-    opt_ndev         = str_ndev    ? strtoul(str_ndev, NULL, 0) : opt_ndev;
+    opt_profile      = getenv("GGML_HEXAGON_PROFILE") != nullptr;
+    opt_etm          = getenv("GGML_HEXAGON_ETM") != nullptr;
+    opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr;

-    if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
-        opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
+    const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
+    if (str_opmask != nullptr) {
+        opt_opmask = strtoul(str_opmask, NULL, 0);
+    }
+    opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr;
+
+    const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
+    if (str_ndev) {
+        opt_ndev = strtoul(str_ndev, NULL, 0);
+        if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
+            opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
+        }
    }

+    const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
+    if (str_nhvx) {
+        opt_nhvx = strtoul(str_nhvx, NULL, 0);
+    }
+
+    const char * str_arch = getenv("GGML_HEXAGON_ARCH");
    if (str_arch) {
        if (str_arch[0] == 'v') {
            str_arch++;
@@ -3152,6 +3113,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
        opt_arch = strtoul(str_arch, NULL, 0);
    }

+    opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
+
    reg->context = new ggml_hexagon_registry(reg);

    HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -17,7 +17,11 @@ add_library(${HTP_LIB} SHARED
    main.c
    htp_iface_skel.c
    worker-pool.c
-    hex-dma.c
+    htp-dma.c
+    hvx-sigmoid.c
+    hvx-inverse.c
+    hvx-exp.c
+    hvx-utils.c
    matmul-ops.c
    binary-ops.c
    unary-ops.c
@@ -27,12 +31,10 @@ add_library(${HTP_LIB} SHARED
    flash-attn-ops.c
    set-rows-ops.c
    get-rows-ops.c
-    cpy-ops.c
 )

 target_compile_definitions(${HTP_LIB} PRIVATE
    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
-    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
    FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})

 build_idl(htp_iface.idl ${HTP_LIB})
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -2,20 +2,27 @@
 #pragma clang diagnostic ignored "-Wunused-function"
 #pragma clang diagnostic ignored "-Wunused-but-set-variable"

+#ifdef HTP_DEBUG
+#    define FARF_HIGH 1
+#endif
 #include <HAP_farf.h>
+#include <HAP_mem.h>
 #include <HAP_perf.h>
-
+#include <HAP_ps.h>
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
 #include <math.h>
+#include <qurt_thread.h>
 #include <string.h>

-#include "hex-dma.h"
-#include "hvx-utils.h"
-
 #define GGML_COMMON_DECL_C
 #include "ggml-common.h"
 #include "htp-ctx.h"
+#include "htp-dma.h"
 #include "htp-msg.h"
 #include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"

 #define htp_act_preamble3              \
    const uint32_t ne00 = src0->ne[0]; \
@@ -69,7 +76,7 @@
    const uint32_t nb2 = dst->nb[2];   \
    const uint32_t nb3 = dst->nb[3];

-static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
+static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
                                       const struct htp_tensor * src1,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
@@ -117,9 +124,9 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
        data_src1 += swapped ? 0 : nc_in_bytes;
    }

-    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
-    const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
-    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);

    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
@@ -168,9 +175,9 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
            float *       dst_spad_ptr  = dst_spad + ib * (dst_row_size_aligned / sizeof(float));

            //swiglu(x) = x1 * sigmoid(x0)
-            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, nc);
-            hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
-                                (const uint8_t *) src1_spad_ptr, nc);
+            hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
+            hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
+                                (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
        }

        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
@@ -196,7 +203,7 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }

-static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
+static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
                                           const struct htp_tensor * src1,
                                           struct htp_tensor *       dst,
                                           const int32_t *           op_params,
@@ -242,9 +249,9 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
        data_src1 += swapped ? 0 : nc_in_bytes;
    }

-    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
-    const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
-    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);

    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
@@ -297,18 +304,18 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
            float *       dst_spad_ptr  = dst_spad + ib * (dst_row_size_aligned / sizeof(float));

            // x (src0_spad_data) = std::min(src0_p[k], limit);
-            hvx_min_scalar_f32((uint8_t *) src0_spad_ptr, (const uint8_t *) src0_spad_ptr, limit, nc);
+            hvx_min_scalar_f32((const uint8_t *) src0_spad_ptr, limit, (uint8_t *) src0_spad_ptr, nc);
            // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
-            hvx_clamp_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, -limit, limit, nc);
+            hvx_clamp_scalar_f32((const uint8_t *) src1_spad_ptr, -limit, limit, (uint8_t *) src1_spad_ptr, nc);
            // y (src1_spad_data)  = y1 + 1.f
-            hvx_add_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, 1.0, nc);
+            hvx_add_scalar_f32((const uint8_t *) src1_spad_ptr, 1.0, (uint8_t *) src1_spad_ptr, nc);
            // x1 (dst_spad_data) = alpha * (x)
-            hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, alpha, nc);
+            hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, alpha, (uint8_t *) dst_spad_ptr, nc);
            // x2 (dst_spad_data) = sigmoid(x1) = 1/(1+exp(-x1))
-            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, nc);
+            hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
            // out = x * sigmoid(alpha * x) * (y + 1.f)
-            hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
-                                (const uint8_t *) src1_spad_ptr, nc);
+            hvx_mul_mul_f32_opt((const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
+                                (const uint8_t *) src1_spad_ptr, (uint8_t *) dst_spad_ptr, nc);
        }

        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
@@ -335,7 +342,7 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
 }


-static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
+static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
@@ -351,8 +358,8 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,

    const size_t src0_row_size = nb01;
    const size_t dst_row_size  = nb1;
-    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
-    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);

    const uint32_t src0_nrows = ne01 * ne02 * ne03;

@@ -408,9 +415,9 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
            float* dst_spad_ptr        = dst_spad  + ib * (dst_row_size_aligned  / sizeof(float));

            // gelu = x * sigmoid(1.702 * x) // current implementation
-            hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0);
-            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
-            hvx_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
+            hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
+            hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+            hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
        }

        dma_queue_push_vtcm_to_ddr(dma_queue,
@@ -435,15 +442,15 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
         ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }

-static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) {
+static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
-    unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
+    unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
 }



-static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
+static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
@@ -459,8 +466,8 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0,

    const size_t src0_row_size = nb01;
    const size_t dst_row_size  = nb1;
-    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
-    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);

    const uint32_t src0_nrows = ne01 * ne02 * ne03;

@@ -515,8 +522,8 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
            float* dst_spad_ptr        = dst_spad  + ib * (dst_row_size_aligned  / sizeof(float));

            // silu = x * sigmoid(x)
-            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0);
-            hvx_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
+            hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+            hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
        }

        dma_queue_push_vtcm_to_ddr(dma_queue,
@@ -541,25 +548,25 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
         ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }

-static void unary_silu_f32(unsigned int n, unsigned int i, void * data) {
+static void unary_silu_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
-    unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
+    unary_silu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
 }

-static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) {
+static void glu_swiglu_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
-    glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
+    glu_swiglu_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
                               &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
 }

-static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) {
+static void glu_swiglu_oai_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
-    glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
+    glu_swiglu_oai_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
                                   &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
 }

-static int execute_op_activations_f32(struct htp_ops_context * octx) {
+static int execute_op_activations_fp32(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    const struct htp_tensor * src0 = &octx->src0;
@@ -576,21 +583,21 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {

    switch (octx->op) {
        case HTP_OP_UNARY_SILU:
-            act_op_func = unary_silu_f32;
+            act_op_func = unary_silu_fp32;
            op_type     = "silu-f32";
            break;

        case HTP_OP_GLU_SWIGLU:
-            act_op_func = glu_swiglu_f32;
+            act_op_func = glu_swiglu_fp32;
            op_type     = "swiglu-f32";
            break;

        case HTP_OP_GLU_SWIGLU_OAI:
-            act_op_func = glu_swiglu_oai_f32;
+            act_op_func = glu_swiglu_oai_fp32;
            op_type     = "swiglu-oai-f32";
            break;
        case HTP_OP_UNARY_GELU:
-            act_op_func = unary_gelu_f32;
+            act_op_func = unary_gelu_fp32;
            op_type     = "gelu-f32";
            break;
        default:
@@ -610,9 +617,9 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
        src1_row_size = src0_row_size;
    }

-    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
-    const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
-    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
    // VTCM scratchpads for all tensors
    // N rows per thread, padded to HVX vector size

@@ -663,7 +670,7 @@ int op_activations(struct htp_ops_context * octx) {

    switch (octx->src0.type) {
        case HTP_TYPE_F32:
-            err = execute_op_activations_f32(octx);
+            err = execute_op_activations_fp32(octx);
            break;

        default:
--- a/ggml/src/ggml-hexagon/htp/binary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -2,25 +2,36 @@
 #pragma clang diagnostic ignored "-Wunused-function"
 #pragma clang diagnostic ignored "-Wunused-but-set-variable"

+#ifdef HTP_DEBUG
+#    define FARF_HIGH 1
+#endif
+
 #include <HAP_farf.h>
+#include <HAP_mem.h>
 #include <HAP_perf.h>
-
+#include <HAP_ps.h>
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
 #include <math.h>
+#include <qurt_thread.h>
 #include <string.h>

-#include "hex-dma.h"
-#include "hvx-utils.h"
-
 #define GGML_COMMON_DECL_C
 #include "ggml-common.h"
 #include "htp-ctx.h"
+#include "htp-dma.h"
 #include "htp-msg.h"
 #include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"

-typedef void (*hvx_elemwise_f32_func)(uint8_t * data_dst, const uint8_t * src0, const uint8_t * src1, const uint32_t num_elems);
+typedef void (*hvx_elemwise_f32_func)(const uint8_t * src0,
+                                      const uint8_t * src1,
+                                      uint8_t *       data_dst,
+                                      const int       num_elems);

 static hvx_elemwise_f32_func func_table_HVX[]     = { hvx_mul_f32, hvx_add_f32, hvx_sub_f32 };
-static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_aa, hvx_add_f32_aa, hvx_sub_f32_aa };
+static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f32_opt, hvx_sub_f32_opt };

 #define htp_binary_preamble            \
    const struct htp_tensor * src0 = &octx->src0; \
@@ -87,8 +98,9 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx,

    int is_aligned = 1;
    int opt_path   = 0;
-    if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) ||
-        (0 == hex_is_aligned((void *) dst->data, VLEN))) {
+    if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
+        (0 == htp_is_aligned((void *) dst->data, VLEN))) {
+        FARF(HIGH, "binary-f32: unaligned addresses in elementwise op, possibly slower execution\n");
        is_aligned = 0;
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
@@ -118,24 +130,24 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx,
        const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;

        if (ir + 1 < src0_end_row) {
-            hex_l2fetch(src0_ptr + ne00, src0_row_size, src0_row_size, 1);
+            htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
            if (src1_row_size == src0_row_size) {
-                hex_l2fetch(src1_ptr, src1_row_size, src1_row_size, 1);
+                htp_l2fetch(src1_ptr, 1, src1_row_size, src1_row_size);
            }
        }

        const uint32_t nr0 = ne00 / ne10;
        if (nr0 > 1) {
            if ((1 == is_aligned) && (nr0 == ne00)) {
-                hvx_splat_f32_a(spad_data_th, *(float *) src1_ptr, nr0);
+                hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0);
            } else {
                for (uint32_t r = 0; r < nr0; r++) {
                    memcpy(spad_data_th + r * nb11, (const uint8_t *) src1_ptr, nb11);
                }
            }
-            func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, ne00);
+            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, (uint8_t *) dst_ptr, ne00);
        } else {
-            func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, ne00);
+            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
        }

        src0_ptr += src0_row_size;
@@ -173,6 +185,11 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

+    if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
+        (0 == htp_is_aligned((void *) dst->data, VLEN))) {
+        FARF(HIGH, "add-id-f32: unaligned addresses, possibly slower execution\n");
+    }
+
    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict data_dst        = (uint8_t *) dst->data;
@@ -193,9 +210,9 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
        const float * restrict src1_ptr = (const float *) (data_src1 + 0 + 0 + i11 * nb11);

        if (ir + 1 < src0_end_row) {
-            hex_l2fetch(src0_ptr + ne00, src0_row_size, src0_row_size, 1);
+            htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
            if (src1_row_size == src0_row_size) {
-                hex_l2fetch(src1_ptr + ne10, src1_row_size, src1_row_size, 1);
+                htp_l2fetch(src1_ptr + ne10, 1, src1_row_size, src1_row_size);
            }
        }

@@ -204,9 +221,9 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
            for (uint32_t r = 0; r < nr0; r++) {
                memcpy(spad_data + r * nb10, (const uint8_t *) src1_ptr, nb10);
            }
-            func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) spad_data, ne00);
+            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data, (uint8_t *) dst_ptr, ne00);
        } else {
-            func_HVX((uint8_t *) dst_ptr, (const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, ne00);
+            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
        }
    }

@@ -282,9 +299,9 @@ static int execute_op_binary_f32(struct htp_ops_context * octx) {
    const size_t dst_row_size  = dst->nb[1];

    // VTCM scratchpads for all tensors
-    octx->dst_spad.size  = hex_round_up(dst_row_size, 128) * n_threads;
-    octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads;
-    octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads;
+    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * n_threads;
+    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
+    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;

    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;

--- a/ggml/src/ggml-hexagon/htp/cpy-ops.c
+++ b/ggml/src/ggml-hexagon/htp/cpy-ops.c
@@ -1,251 +0,0 @@
-#pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-function"
-#pragma clang diagnostic ignored "-Wunused-but-set-variable"
-
-#include <HAP_farf.h>
-#include <HAP_perf.h>
-
-#include <math.h>
-#include <string.h>
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-#include "htp-ctx.h"
-#include "htp-msg.h"
-#include "htp-ops.h"
-#include "hvx-utils.h"
-
-struct htp_copy_context {
-    struct htp_ops_context * octx;
-
-    uint32_t          src0_type_size;
-    uint32_t          src0_block_size;
-
-    uint32_t          dst_type_size;
-    uint32_t          dst_block_size;
-
-    uint32_t          src0_blocks_per_row;
-    uint32_t          dst_blocks_per_row;
-
-    uint32_t          src0_nrows_per_thread;
-
-    void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith);
-};
-
-#define cpy_preamble                       \
-    struct htp_tensor *src0 = &octx->src0; \
-    struct htp_tensor *dst  = &octx->dst;  \
-                                           \
-    const uint32_t ne00 = src0->ne[0];     \
-    const uint32_t ne01 = src0->ne[1];     \
-    const uint32_t ne02 = src0->ne[2];     \
-    const uint32_t ne03 = src0->ne[3];     \
-                                           \
-    const uint32_t nb00 = src0->nb[0];     \
-    const uint32_t nb01 = src0->nb[1];     \
-    const uint32_t nb02 = src0->nb[2];     \
-    const uint32_t nb03 = src0->nb[3];     \
-                                           \
-    const uint32_t  ne0 = dst->ne[0];      \
-    const uint32_t  ne1 = dst->ne[1];      \
-    const uint32_t  ne2 = dst->ne[2];      \
-    const uint32_t  ne3 = dst->ne[3];      \
-                                           \
-    const uint32_t  nb0 = dst->nb[0];      \
-    const uint32_t  nb1 = dst->nb[1];      \
-    const uint32_t  nb2 = dst->nb[2];      \
-    const uint32_t  nb3 = dst->nb[3];      \
-                                           \
-    const uint32_t   nr = ne01;
-
-static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
-    cpy_preamble;
-
-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // copy by rows
-    for (uint32_t i03 = 0; i03 < ne03; i03++) {
-        for (uint32_t i02 = 0; i02 < ne02; i02++) {
-            #pragma unroll(2)
-            for (uint32_t i01 = ir0; i01 < ir1; i01++) {
-                uint8_t* dst_ptr  = (uint8_t*) dst->data  + i01*nb1  + i02*nb2  + i03*nb3;
-                uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2);
-                hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size);
-            }
-        }
-    }
-}
-
-static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) {
-    cpy_preamble;
-
-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // dst counters
-    int64_t k10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    // number of blocks in a row
-    const int64_t nk00 = ct->src0_blocks_per_row;
-    const int64_t nk0  = ct->dst_blocks_per_row;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            k10 += nk00 * ir0;
-            while (k10 >= nk0) {
-                k10 -= nk0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-            for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t k00 = 0; k00 < nk00; k00++) {
-                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          char * dst_ptr  = ((char *)  dst->data + k10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-                    memcpy(dst_ptr, src0_ptr, ct->dst_type_size);
-
-                    if (++k10 == nk0) {
-                        k10 = 0;
-                        if (++i11 == ne1) {
-                            i11 = 0;
-                            if (++i12 == ne2) {
-                                i12 = 0;
-                                if (++i13 == ne3) {
-                                    i13 = 0;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            k10 += nk00 * (ne01 - ir1);
-            while (k10 >= nk0) {
-                k10 -= nk0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
-    cpy_preamble;
-
-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // copy by rows
-    for (uint32_t i03 = 0; i03 < ne03; i03++) {
-        for (uint32_t i02 = 0; i02 < ne02; i02++) {
-            #pragma unroll(2)
-            for (uint32_t i01 = ir0; i01 < ir1; i01++) {
-                uint8_t* dst_ptr  = (uint8_t*) dst->data  + i01*nb1  + i02*nb2  + i03*nb3;
-                uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                hex_l2fetch(src0_ptr, ne00 * sizeof(float), nb01, 2);
-                hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00);
-            }
-        }
-    }
-}
-
-static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
-    cpy_preamble;
-
-    // parallelize by src0 rows
-    const uint32_t dr  = ct->src0_nrows_per_thread;
-    const uint32_t ir0 = dr * ith;
-    const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
-
-    // copy by rows
-    for (uint32_t i03 = 0; i03 < ne03; i03++) {
-        for (uint32_t i02 = 0; i02 < ne02; i02++) {
-            #pragma unroll(2)
-            for (uint32_t i01 = ir0; i01 < ir1; i01++) {
-                uint8_t* dst_ptr  = (uint8_t*) dst->data  + i01*nb1  + i02*nb2  + i03*nb3;
-                uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                hex_l2fetch(src0_ptr, ne00 * sizeof(__fp16), nb01, 2);
-                hvx_copy_f32_f16_uu(dst_ptr, src0_ptr, ne00);
-            }
-        }
-    }
-}
-
-static void cpy_work_func(unsigned int n, unsigned int i, void *data) {
-    struct htp_copy_context *ct = (struct htp_copy_context *) data;
-    ct->copy(ct, ct->octx, n, i);
-}
-
-int op_cpy(struct htp_ops_context * octx) {
-    cpy_preamble;
-
-    struct htp_copy_context ct;
-    ct.octx = octx;
-
-    switch (src0->type) {
-    case HTP_TYPE_F32: ct.src0_type_size = 4; ct.src0_block_size = 1; ct.src0_blocks_per_row = ne00 / 1; break;
-    case HTP_TYPE_F16: ct.src0_type_size = 2; ct.src0_block_size = 1; ct.src0_blocks_per_row = ne00 / 1; break;
-    default:
-        return HTP_STATUS_NO_SUPPORT;
-    }
-
-    switch (dst->type) {
-    case HTP_TYPE_F32: ct.dst_type_size = 4; ct.dst_block_size = 1; ct.dst_blocks_per_row = ne0 / 1; break;
-    case HTP_TYPE_F16: ct.dst_type_size = 2; ct.dst_block_size = 1; ct.dst_blocks_per_row = ne0 / 1; break;
-    default:
-        return HTP_STATUS_NO_SUPPORT;
-    }
-
-    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
-        return HTP_STATUS_OK;
-    }
-
-    const bool sametype   = (src0->type == dst->type);
-    const bool transposed = (nb00 > nb01) || (nb0 > nb1);
-    const bool sameshape  = !transposed && (ne00 == ne0 && ne01 == ne1 && ne02 == ne2 && ne03 == ne3);
-
-    const uint32_t n_jobs = MIN(nr, octx->n_threads);
-    ct.src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
-
-    if (sametype && sameshape) {
-        ct.copy = cpy_thread_sametype_sameshape;
-    } else if (sameshape) {
-        /**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32)
-            ct.copy = cpy_thread_f16_f32_sameshape;
-        else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16)
-            ct.copy = cpy_thread_f32_f16_sameshape;
-        else
-            return HTP_STATUS_NO_SUPPORT;
-    } else if (sametype) {
-        ct.copy = cpy_thread_sametype_reshape;
-    } else {
-        return HTP_STATUS_NO_SUPPORT;
-    }
-
-    worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_jobs);
-
-    return HTP_STATUS_OK;
-}
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@@ -2,20 +2,25 @@
 #pragma clang diagnostic ignored "-Wunused-function"
 #pragma clang diagnostic ignored "-Wunused-but-set-variable"

+#ifdef HTP_DEBUG
+#    define FARF_HIGH 1
+#endif
 #include <HAP_farf.h>
+#include <HAP_mem.h>
 #include <HAP_perf.h>
-
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
 #include <math.h>
 #include <string.h>

-#include "hex-dma.h"
-#include "hvx-utils.h"
-
 #define GGML_COMMON_DECL_C
 #include "ggml-common.h"
 #include "htp-ctx.h"
+#include "htp-dma.h"
 #include "htp-msg.h"
 #include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"

 // Dot product of FP32 and FP16 vectors, accumulating to float
 static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) {
@@ -65,8 +70,8 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
    }

-    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
-    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
+    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s));
+    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum));

    hvx_vec_store_u(r, 4, rsum);
 }
@@ -106,8 +111,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf),  Q6_V_hi_W(xy_qf)));
    }

-    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
-    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
+    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s));
+    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum));
    hvx_vec_store_u(r, 4, rsum);
 }

@@ -119,7 +124,7 @@ static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict
    uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
    uint32_t nloe = n % VLEN_FP16; // leftover elements

-    HVX_Vector S = hvx_vec_splat_f16(s);
+    HVX_Vector S = hvx_vec_splat_fp16(s);

    uint32_t i = 0;
    #pragma unroll(4)
@@ -143,7 +148,7 @@ static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict

        if (nloe) {
            HVX_Vector xy = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i]));
-            hvx_vec_store_a(&ptr_y[i], nloe * 4, xy);
+            hvx_vec_store_u(&ptr_y[i], nloe * 4, xy);
        }
    }
 }
@@ -220,18 +225,18 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
    const uint32_t DV = nev0;

    const size_t size_q_row = DK * ((q->type == HTP_TYPE_F32) ? 4 : 2);
-    const size_t size_q_row_padded = hex_round_up(size_q_row, 128);
+    const size_t size_q_row_padded = htp_round_up(size_q_row, 128);

    const size_t size_k_row = DK * sizeof(__fp16);
    const size_t size_v_row = DV * sizeof(__fp16);
    const size_t size_m_row = FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16); // Treat block as one row for mask

-    const size_t size_k_row_padded = hex_round_up(size_k_row, 128);
-    const size_t size_v_row_padded = hex_round_up(size_v_row, 128);
+    const size_t size_k_row_padded = htp_round_up(size_k_row, 128);
+    const size_t size_v_row_padded = htp_round_up(size_v_row, 128);

    const size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE;
    const size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE;
-    const size_t size_m_block = hex_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);
+    const size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);

    // Scratchpad buffers for Q, K, V, Mask, and VKQ32 accumulator
    uint8_t * spad_q = octx->src0_spad.data + octx->src0_spad.size_per_thread * ith;
@@ -267,8 +272,8 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
        float M = -INFINITY; // maximum KQ value

        // Clear accumulator
-        hvx_splat_f32_a(spad_a, 0, DV);
        float * VKQ32 = (float *) spad_a;
+        memset(VKQ32, 0, DV * sizeof(float));

        const __fp16 * mp_base = NULL;
        if (mask) {
@@ -335,30 +340,30 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in

                // 2. Softcap
                if (logit_softcap != 0.0f) {
-                    scores = hvx_vec_tanh_f32(scores);
-                    scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_f32(logit_softcap));
+                    scores = hvx_vec_tanh_fp32(scores);
+                    scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_fp32(logit_softcap));
                    scores = Q6_Vsf_equals_Vqf32(scores);
                }

                // 3. Mask
                if (mask) {
                    const __fp16 * mp = m_base + ic;
-                    HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp;
+                    HVX_Vector m_vals_fp16 = *(const HVX_UVector *) mp;

-                    HVX_Vector one_f16 = Q6_Vh_vsplat_R(0x3c00);
-                    HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), one_f16);
+                    HVX_Vector one_fp16 = Q6_Vh_vsplat_R(0x3c00);
+                    HVX_VectorPair m_vals_fp32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_fp16), one_fp16);

-                    HVX_Vector m_vals_f32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_f32_pair));
+                    HVX_Vector m_vals_fp32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_fp32_pair));

-                    HVX_Vector slope_vec = hvx_vec_splat_f32(slope);
-                    HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_f32, slope_vec);
+                    HVX_Vector slope_vec = hvx_vec_splat_fp32(slope);
+                    HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_fp32, slope_vec);
                    scores = Q6_Vqf32_vadd_VsfVsf(scores, Q6_Vsf_equals_Vqf32(add_val));
                    scores = Q6_Vsf_equals_Vqf32(scores);
                }

                // 4. Online Softmax Update
-                HVX_Vector v_max = hvx_vec_reduce_max_f32(scores);
-                float m_block = hvx_vec_get_f32(v_max);
+                HVX_Vector v_max = hvx_vec_reduce_max_fp32(scores);
+                float m_block = hvx_vec_get_fp32(v_max);

                float M_old = M;
                float M_new = (m_block > M) ? m_block : M;
@@ -369,12 +374,12 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
                hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms);
                S = S * ms;

-                HVX_Vector M_new_vec = hvx_vec_splat_f32(M_new);
+                HVX_Vector M_new_vec = hvx_vec_splat_fp32(M_new);
                HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_new_vec);
-                HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted));
+                HVX_Vector P = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(scores_shifted));

-                HVX_Vector p_sum_vec = hvx_vec_reduce_sum_f32(P);
-                float p_sum = hvx_vec_get_f32(p_sum_vec);
+                HVX_Vector p_sum_vec = hvx_vec_fp32_reduce_sum(P);
+                float p_sum = hvx_vec_get_fp32(p_sum_vec);
                S += p_sum;

                // 5. Accumulate V
@@ -479,9 +484,9 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
        uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1;

        if (dst->type == HTP_TYPE_F32) {
-            hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
+            hvx_copy_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
        } else if (dst->type == HTP_TYPE_F16) {
-            hvx_copy_f16_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
+            hvx_copy_fp16_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
        }
    }
 }
@@ -518,16 +523,16 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
        octx->src3_div3 = init_fastdiv_values(mask->ne[3]);
    }

-    size_t size_q_row_padded = hex_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 4 : 2), 128);
-    size_t size_k_row_padded = hex_round_up(k->ne[0] * sizeof(__fp16), 128);
-    size_t size_v_row_padded = hex_round_up(v->ne[0] * sizeof(__fp16), 128);
+    size_t size_q_row_padded = htp_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 4 : 2), 128);
+    size_t size_k_row_padded = htp_round_up(k->ne[0] * sizeof(__fp16), 128);
+    size_t size_v_row_padded = htp_round_up(v->ne[0] * sizeof(__fp16), 128);

    size_t size_q_block = size_q_row_padded * 1; // single row for now
    size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE;
    size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE;
-    size_t size_m_block = hex_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);
+    size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);

-    size_t size_vkq_acc = hex_round_up(v->ne[0] * sizeof(float), 128); // VKQ32
+    size_t size_vkq_acc = htp_round_up(v->ne[0] * sizeof(float), 128); // VKQ32

    octx->src0_spad.size_per_thread = size_q_block * 1;
    octx->src1_spad.size_per_thread = size_k_block * 2;
--- a/ggml/src/ggml-hexagon/htp/get-rows-ops.c
+++ b/ggml/src/ggml-hexagon/htp/get-rows-ops.c
@@ -2,9 +2,14 @@
 #pragma clang diagnostic ignored "-Wunused-function"
 #pragma clang diagnostic ignored "-Wunused-but-set-variable"

+#ifdef HTP_DEBUG
+#    define FARF_HIGH 1
+#endif
 #include <HAP_farf.h>
+#include <HAP_mem.h>
 #include <HAP_perf.h>
-
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
 #include <math.h>
 #include <string.h>

@@ -14,6 +19,7 @@
 #include "htp-msg.h"
 #include "htp-ops.h"
 #include "hvx-utils.h"
+#include "ops-utils.h"

 #define get_rows_preamble \
    const uint32_t ne00 = octx->src0.ne[0]; \
@@ -66,7 +72,7 @@ static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth,

        const uintptr_t src0_ptr = octx->src0.data + i01*nb01 + i11*nb02 + i12*nb03;
        const uintptr_t dst_ptr  = octx->dst.data  + i10*nb1  + i11*nb2  + i12*nb3;
-        hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
+        hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
    }

    return HTP_STATUS_OK;
--- a/ggml/src/ggml-hexagon/htp/hex-dump.h
+++ b/ggml/src/ggml-hexagon/htp/hex-dump.h
@@ -1,77 +0,0 @@
-#ifndef HEX_DUMP_H
-#define HEX_DUMP_H
-
-#include <HAP_farf.h>
-
-static inline void hex_dump_int8_line(char * pref, const int8_t * x, int n) {
-    char str[1024], *p = str, *p_end = str + sizeof(str);
-    p += snprintf(p, p_end - p, "%s: ", pref);
-    for (int i = 0; i < n && p < p_end; i++) {
-        p += snprintf(p, p_end - p, "%d, ", x[i]);
-    }
-    FARF(HIGH, "%s\n", str);
-}
-
-static inline void hex_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) {
-    char str[1024], *p = str, *p_end = str + sizeof(str);
-    p += snprintf(p, p_end - p, "%s: ", pref);
-    for (int i = 0; i < n && p < p_end; i++) {
-        p += snprintf(p, p_end - p, "%d, ", x[i]);
-    }
-    FARF(HIGH, "%s\n", str);
-}
-
-static inline void hex_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
-    char str[1024], *p = str, *p_end = str + sizeof(str);
-    p += snprintf(p, p_end - p, "%s: ", pref);
-    for (int i = 0; i < n; i++) {
-        p += snprintf(p, p_end - p, "%d, ", (int) x[i]);
-    }
-    FARF(HIGH, "%s\n", str);
-}
-
-static inline void hex_dump_f16_line(char * pref, const __fp16 * x, uint32_t n) {
-    char str[1024], *p = str, *p_end = str + sizeof(str);
-    p += snprintf(p, p_end - p, "%s: ", pref);
-    for (int i = 0; i < n; i++) {
-        p += snprintf(p, p_end - p, "%.6f, ", (float) x[i]);
-    }
-    FARF(HIGH, "%s\n", str);
-}
-
-static inline void hex_dump_f32_line(char * pref, const float * x, uint32_t n) {
-    char str[1024], *p = str, *p_end = str + sizeof(str);
-    p += snprintf(p, p_end - p, "%s: ", pref);
-    for (int i = 0; i < n; i++) {
-        p += snprintf(p, p_end - p, "%.6f, ", x[i]);
-    }
-    FARF(HIGH, "%s\n", str);
-}
-
-static inline void hex_dump_f32(char * pref, const float * x, uint32_t n) {
-    uint32_t n0 = n / 16;
-    uint32_t n1 = n % 16;
-
-    uint32_t i = 0;
-    for (; i < n0; i++) {
-        hex_dump_f32_line(pref, x + (16 * i), 16);
-    }
-    if (n1) {
-        hex_dump_f32_line(pref, x + (16 * i), n1);
-    }
-}
-
-static inline void hex_dump_f16(char * pref, const __fp16 * x, uint32_t n) {
-    uint32_t n0 = n / 16;
-    uint32_t n1 = n % 16;
-
-    uint32_t i = 0;
-    for (; i < n0; i++) {
-        hex_dump_f16_line(pref, x + (16 * i), 16);
-    }
-    if (n1) {
-        hex_dump_f16_line(pref, x + (16 * i), n1);
-    }
-}
-
-#endif /* HEX_DUMP_H */
--- a/ggml/src/ggml-hexagon/htp/hex-fastdiv.h
+++ b/ggml/src/ggml-hexagon/htp/hex-fastdiv.h
@@ -1,37 +0,0 @@
-#ifndef HEX_FASTDIV_H
-#define HEX_FASTDIV_H
-
-// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
-// Precompute mp (m' in the paper) and L such that division
-// can be computed using a multiply (high 32b of 64b result)
-// and a shift:
-//
-// n/d = (mulhi(n, mp) + n) >> L;
-struct fastdiv_values {
-    uint32_t mp;
-    uint32_t l;
-};
-
-static inline struct fastdiv_values init_fastdiv_values(uint32_t d) {
-    struct fastdiv_values result = { 0, 0 };
-    // compute L = ceil(log2(d));
-    while (result.l < 32 && ((uint32_t) 1 << result.l) < d) {
-        ++(result.l);
-    }
-
-    result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1);
-    return result;
-}
-
-static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) {
-    // Compute high 32 bits of n * mp
-    const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32);  // mulhi(n, mp)
-    // add n, apply bit shift
-    return (hi + n) >> vals->l;
-}
-
-static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) {
-    return n - fastdiv(n, vals) * d;
-}
-
-#endif /* HEX_FASTDIV_H */
--- a/ggml/src/ggml-hexagon/htp/hex-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hex-utils.h
@@ -1,51 +0,0 @@
-#ifndef HEX_UTILS_H
-#define HEX_UTILS_H
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "hexagon_types.h"
-
-#include "hex-fastdiv.h"
-#include "hex-dump.h"
-
-#ifndef MAX
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
-#ifndef MIN
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-static inline uint64_t hex_get_cycles() {
-    uint64_t cycles = 0;
-    asm volatile(" %0 = c15:14\n" : "=r"(cycles));
-    return cycles;
-}
-
-static inline uint64_t hex_get_pktcnt() {
-    uint64_t pktcnt;
-    asm volatile(" %0 = c19:18\n" : "=r"(pktcnt));
-    return pktcnt;
-}
-
-static inline int32_t hex_is_aligned(void * addr, uint32_t align) {
-    return ((size_t) addr & (align - 1)) == 0;
-}
-
-static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
-    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
-    uint32_t right_off = left_off + n;
-    return right_off <= chunk_size;
-}
-
-static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
-    return m * ((n + m - 1) / m);
-}
-
-static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
-    const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
-    Q6_l2fetch_AP((void *) p, control);
-}
-
-#endif /* HEX_UTILS_H */
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -1,7 +1,7 @@
 #ifndef HTP_CTX_H
 #define HTP_CTX_H

-#include "hex-dma.h"
+#include "htp-dma.h"
 #include "worker-pool.h"

 #include <assert.h>
--- a/ggml/src/ggml-hexagon/htp/htp-dma.c
+++ b/ggml/src/ggml-hexagon/htp/htp-dma.c
@@ -1,4 +1,4 @@
-#include "hex-dma.h"
+#include "htp-dma.h"

 #include <stdbool.h>
 #include <stdlib.h>
--- a/ggml/src/ggml-hexagon/htp/htp-dma.h
+++ b/ggml/src/ggml-hexagon/htp/htp-dma.h
@@ -2,6 +2,7 @@
 #define HTP_DMA_H

 #include <HAP_farf.h>
+#include <hexagon_protos.h>
 #include <hexagon_types.h>
 #include <stdbool.h>
 #include <stdint.h>
--- a/ggml/src/ggml-hexagon/htp/htp-msg.h
+++ b/ggml/src/ggml-hexagon/htp/htp-msg.h
@@ -63,7 +63,6 @@ enum htp_op {
    HTP_OP_SET_ROWS       = 15,
    HTP_OP_SCALE          = 16,
    HTP_OP_GET_ROWS       = 17,
-    HTP_OP_CPY            = 18,
    INVALID
 };

--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -4,12 +4,11 @@
 #include "htp-ctx.h"
 #include "htp-msg.h"
 #include "worker-pool.h"
+#include "ops-utils.h"

 #include <assert.h>
 #include <stdint.h>

-#include <hex-fastdiv.h>
-
 // ggml-common.h must be included prior to this header

 struct htp_spad {
@@ -75,14 +74,6 @@ struct htp_ops_context {
    struct fastdiv_values get_rows_div_ne10;      // fastdiv values for ne10
    struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11

-    struct fastdiv_values cpy_div_ne01; // fastdiv values for ne01
-    struct fastdiv_values cpy_div_ne02; // fastdiv values for ne02
-    struct fastdiv_values cpy_div_ne03; // fastdiv values for ne03
-
-    struct fastdiv_values cpy_rshp_div_n0;       // fastdiv values for ne00
-    struct fastdiv_values cpy_rshp_div_n1n0;     // fastdiv values for ne00*ne01
-    struct fastdiv_values cpy_rshp_div_n2n1n0;   // fastdiv values for ne00*ne01*ne02
-
    uint32_t flags;
 };

@@ -97,6 +88,5 @@ int op_rope(struct htp_ops_context * octx);
 int op_flash_attn_ext(struct htp_ops_context * octx);
 int op_set_rows(struct htp_ops_context * octx);
 int op_get_rows(struct htp_ops_context * octx);
-int op_cpy(struct htp_ops_context * octx);

 #endif /* HTP_OPS_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-arith.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-arith.h
@@ -1,457 +0,0 @@
-#ifndef HVX_ARITH_H
-#define HVX_ARITH_H
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <math.h>
-
-#include "hvx-base.h"
-#include "hex-utils.h"
-
-//
-// Binary operations (add, mul, sub)
-//
-
-#define hvx_arith_loop_body(dst_type, src0_type, src1_type, vec_store, vec_op) \
-    do {                                                                       \
-        dst_type * restrict vdst  = (dst_type *) dst;                          \
-        src0_type * restrict vsrc0 = (src0_type *) src0;                       \
-        src1_type * restrict vsrc1 = (src1_type *) src1;                       \
-                                                                               \
-        const uint32_t elem_size = sizeof(float);                              \
-        const uint32_t epv  = 128 / elem_size;                                 \
-        const uint32_t nvec = n / epv;                                         \
-        const uint32_t nloe = n % epv;                                         \
-                                                                               \
-        uint32_t i = 0;                                                        \
-                                                                               \
-        _Pragma("unroll(4)")                                                   \
-        for (; i < nvec; i++) {                                                \
-            vdst[i] = vec_op(vsrc0[i], vsrc1[i]);                              \
-        }                                                                      \
-        if (nloe) {                                                            \
-            HVX_Vector v = vec_op(vsrc0[i], vsrc1[i]);                         \
-            vec_store((void *) &vdst[i], nloe * elem_size, v);                 \
-        }                                                                      \
-    } while(0)
-
-#if __HVX_ARCH__ < 79
-#define HVX_OP_ADD(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b))
-#define HVX_OP_SUB(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b))
-#define HVX_OP_MUL(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
-#else
-#define HVX_OP_ADD(a, b) Q6_Vsf_vadd_VsfVsf(a, b)
-#define HVX_OP_SUB(a, b) Q6_Vsf_vsub_VsfVsf(a, b)
-#define HVX_OP_MUL(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
-#endif
-
-// ADD variants
-
-static inline void hvx_add_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src0 % 128 == 0);
-    assert((unsigned long) src1 % 128 == 0);
-    hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_ADD);
-}
-
-static inline void hvx_add_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src0 % 128 == 0);
-    hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_ADD);
-}
-
-static inline void hvx_add_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) src0 % 128 == 0);
-    assert((unsigned long) src1 % 128 == 0);
-    hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_ADD);
-}
-
-static inline void hvx_add_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_ADD);
-}
-
-// SUB variants
-
-static inline void hvx_sub_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src0 % 128 == 0);
-    assert((unsigned long) src1 % 128 == 0);
-    hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_SUB);
-}
-
-static inline void hvx_sub_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src0 % 128 == 0);
-    hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_SUB);
-}
-
-static inline void hvx_sub_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) src0 % 128 == 0);
-    assert((unsigned long) src1 % 128 == 0);
-    hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_SUB);
-}
-
-static inline void hvx_sub_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_SUB);
-}
-
-// MUL variants
-
-static inline void hvx_mul_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src0 % 128 == 0);
-    assert((unsigned long) src1 % 128 == 0);
-    hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MUL);
-}
-
-static inline void hvx_mul_f32_au(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src0 % 128 == 0);
-    hvx_arith_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MUL);
-}
-
-static inline void hvx_mul_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    assert((unsigned long) src0 % 128 == 0);
-    assert((unsigned long) src1 % 128 == 0);
-    hvx_arith_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u, HVX_OP_MUL);
-}
-
-static inline void hvx_mul_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
-    hvx_arith_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MUL);
-}
-
-// Dispatchers
-
-static inline void hvx_add_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) {
-        if (hex_is_aligned((void *) src1, 128)) {
-            hvx_add_f32_aa(dst, src0, src1, num_elems);
-        } else {
-            hvx_add_f32_au(dst, src0, src1, num_elems);
-        }
-    } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) {
-        hvx_add_f32_ua(dst, src0, src1, num_elems);
-    } else {
-        hvx_add_f32_uu(dst, src0, src1, num_elems);
-    }
-}
-
-static inline void hvx_sub_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) {
-        if (hex_is_aligned((void *) src1, 128)) {
-            hvx_sub_f32_aa(dst, src0, src1, num_elems);
-        } else {
-            hvx_sub_f32_au(dst, src0, src1, num_elems);
-        }
-    } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) {
-        hvx_sub_f32_ua(dst, src0, src1, num_elems);
-    } else {
-        hvx_sub_f32_uu(dst, src0, src1, num_elems);
-    }
-}
-
-static inline void hvx_mul_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src0, 128)) {
-        if (hex_is_aligned((void *) src1, 128)) {
-            hvx_mul_f32_aa(dst, src0, src1, num_elems);
-        } else {
-            hvx_mul_f32_au(dst, src0, src1, num_elems);
-        }
-    } else if (hex_is_aligned((void *) src0, 128) && hex_is_aligned((void *) src1, 128)) {
-        hvx_mul_f32_ua(dst, src0, src1, num_elems);
-    } else {
-        hvx_mul_f32_uu(dst, src0, src1, num_elems);
-    }
-}
-
-// Mul-Mul Optimized
-
-static inline void hvx_mul_mul_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint8_t * restrict src2, const uint32_t num_elems) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src0 % 128 == 0);
-    assert((unsigned long) src1 % 128 == 0);
-    assert((unsigned long) src2 % 128 == 0);
-
-    HVX_Vector * restrict vdst  = (HVX_Vector *) dst;
-    HVX_Vector * restrict vsrc0 = (HVX_Vector *) src0;
-    HVX_Vector * restrict vsrc1 = (HVX_Vector *) src1;
-    HVX_Vector * restrict vsrc2 = (HVX_Vector *) src2;
-
-    const uint32_t elem_size = sizeof(float);
-    const uint32_t epv  = 128 / elem_size;
-    const uint32_t nvec = num_elems / epv;
-    const uint32_t nloe = num_elems % epv;
-
-    uint32_t i = 0;
-
-    _Pragma("unroll(4)")
-    for (; i < nvec; i++) {
-        HVX_Vector v1 = HVX_OP_MUL(vsrc0[i], vsrc1[i]);
-        vdst[i] = HVX_OP_MUL(v1, vsrc2[i]);
-    }
-
-    if (nloe) {
-        HVX_Vector v1 = HVX_OP_MUL(vsrc0[i], vsrc1[i]);
-        HVX_Vector v2 = HVX_OP_MUL(v1, vsrc2[i]);
-        hvx_vec_store_a((void *) &vdst[i], nloe * elem_size, v2);
-    }
-}
-
-// Scalar Operations
-
-#define hvx_scalar_loop_body(dst_type, src_type, vec_store, scalar_op_macro)   \
-    do {                                                                       \
-        dst_type * restrict vdst = (dst_type *) dst;                           \
-        src_type * restrict vsrc = (src_type *) src;                           \
-                                                                               \
-        const uint32_t elem_size = sizeof(float);                              \
-        const uint32_t epv  = 128 / elem_size;                                 \
-        const uint32_t nvec = n / epv;                                         \
-        const uint32_t nloe = n % epv;                                         \
-                                                                               \
-        uint32_t i = 0;                                                        \
-                                                                               \
-        _Pragma("unroll(4)")                                                   \
-        for (; i < nvec; i++) {                                                \
-            HVX_Vector v = vsrc[i];                                            \
-            vdst[i] = scalar_op_macro(v);                                      \
-        }                                                                      \
-        if (nloe) {                                                            \
-            HVX_Vector v = vsrc[i];                                            \
-            v = scalar_op_macro(v);                                            \
-            vec_store((void *) &vdst[i], nloe * elem_size, v);                 \
-        }                                                                      \
-    } while(0)
-
-#define HVX_OP_ADD_SCALAR(v) \
-    ({ \
-        const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v); \
-        HVX_Vector out = HVX_OP_ADD(v, val_vec); \
-        Q6_V_vmux_QVV(pred_inf, inf, out); \
-    })
-
-#define HVX_OP_MUL_SCALAR(v) HVX_OP_MUL(v, val_vec)
-#define HVX_OP_SUB_SCALAR(v) HVX_OP_SUB(v, val_vec)
-
-// Add Scalar Variants
-
-static inline void hvx_add_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    const HVX_Vector inf = hvx_vec_splat_f32(INFINITY);
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_ADD_SCALAR);
-}
-
-static inline void hvx_add_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    const HVX_Vector inf = hvx_vec_splat_f32(INFINITY);
-    assert((unsigned long) dst % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_ADD_SCALAR);
-}
-
-static inline void hvx_add_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    const HVX_Vector inf = hvx_vec_splat_f32(INFINITY);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_ADD_SCALAR);
-}
-
-static inline void hvx_add_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    static const float kInf = INFINITY;
-    const HVX_Vector inf = hvx_vec_splat_f32(kInf);
-    hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_ADD_SCALAR);
-}
-
-// Sub Scalar Variants
-
-static inline void hvx_sub_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_SUB_SCALAR);
-}
-
-static inline void hvx_sub_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) dst % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_SUB_SCALAR);
-}
-
-static inline void hvx_sub_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_SUB_SCALAR);
-}
-
-static inline void hvx_sub_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_SUB_SCALAR);
-}
-
-// Mul Scalar Variants
-
-static inline void hvx_mul_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MUL_SCALAR);
-}
-
-static inline void hvx_mul_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) dst % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MUL_SCALAR);
-}
-
-static inline void hvx_mul_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_MUL_SCALAR);
-}
-
-static inline void hvx_mul_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MUL_SCALAR);
-}
-
-static inline void hvx_add_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) {
-        hvx_add_scalar_f32_aa(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) dst, 128)) {
-        hvx_add_scalar_f32_au(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) src, 128)) {
-        hvx_add_scalar_f32_ua(dst, src, val, num_elems);
-    } else {
-        hvx_add_scalar_f32_uu(dst, src, val, num_elems);
-    }
-}
-
-static inline void hvx_mul_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) {
-        hvx_mul_scalar_f32_aa(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) dst, 128)) {
-        hvx_mul_scalar_f32_au(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) src, 128)) {
-        hvx_mul_scalar_f32_ua(dst, src, val, num_elems);
-    } else {
-        hvx_mul_scalar_f32_uu(dst, src, val, num_elems);
-    }
-}
-
-static inline void hvx_sub_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) {
-        hvx_sub_scalar_f32_aa(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) dst, 128)) {
-        hvx_sub_scalar_f32_au(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) src, 128)) {
-        hvx_sub_scalar_f32_ua(dst, src, val, num_elems);
-    } else {
-        hvx_sub_scalar_f32_uu(dst, src, val, num_elems);
-    }
-}
-
-// MIN Scalar variants
-
-#define HVX_OP_MIN_SCALAR(v) Q6_Vsf_vmin_VsfVsf(val_vec, v)
-
-static inline void hvx_min_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_MIN_SCALAR);
-}
-
-static inline void hvx_min_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) dst % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_MIN_SCALAR);
-}
-
-static inline void hvx_min_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_MIN_SCALAR);
-}
-
-static inline void hvx_min_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float val, uint32_t n) {
-    const HVX_Vector val_vec = hvx_vec_splat_f32(val);
-    hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_MIN_SCALAR);
-}
-
-static inline void hvx_min_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float val, const int num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) {
-        hvx_min_scalar_f32_aa(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) dst, 128)) {
-        hvx_min_scalar_f32_au(dst, src, val, num_elems);
-    } else if (hex_is_aligned((void *) src, 128)) {
-        hvx_min_scalar_f32_ua(dst, src, val, num_elems);
-    } else {
-        hvx_min_scalar_f32_uu(dst, src, val, num_elems);
-    }
-}
-
-// CLAMP Scalar variants
-
-#define HVX_OP_CLAMP_SCALAR(v) \
-    ({ \
-        HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(v, max_vec); \
-        HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(min_vec, v); \
-        HVX_Vector tmp = Q6_V_vmux_QVV(pred_cap_right, max_vec, v); \
-        Q6_V_vmux_QVV(pred_cap_left, min_vec, tmp); \
-    })
-
-static inline void hvx_clamp_scalar_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) {
-    const HVX_Vector min_vec = hvx_vec_splat_f32(min);
-    const HVX_Vector max_vec = hvx_vec_splat_f32(max);
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a, HVX_OP_CLAMP_SCALAR);
-}
-
-static inline void hvx_clamp_scalar_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) {
-    const HVX_Vector min_vec = hvx_vec_splat_f32(min);
-    const HVX_Vector max_vec = hvx_vec_splat_f32(max);
-    assert((unsigned long) dst % 128 == 0);
-    hvx_scalar_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a, HVX_OP_CLAMP_SCALAR);
-}
-
-static inline void hvx_clamp_scalar_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) {
-    const HVX_Vector min_vec = hvx_vec_splat_f32(min);
-    const HVX_Vector max_vec = hvx_vec_splat_f32(max);
-    assert((unsigned long) src % 128 == 0);
-    hvx_scalar_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u, HVX_OP_CLAMP_SCALAR);
-}
-
-static inline void hvx_clamp_scalar_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, uint32_t n) {
-    const HVX_Vector min_vec = hvx_vec_splat_f32(min);
-    const HVX_Vector max_vec = hvx_vec_splat_f32(max);
-    hvx_scalar_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u, HVX_OP_CLAMP_SCALAR);
-}
-
-static inline void hvx_clamp_scalar_f32(uint8_t * restrict dst, const uint8_t * restrict src, const float min, const float max, const int num_elems) {
-    if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) {
-        hvx_clamp_scalar_f32_aa(dst, src, min, max, num_elems);
-    } else if (hex_is_aligned((void *) dst, 128)) {
-        hvx_clamp_scalar_f32_au(dst, src, min, max, num_elems);
-    } else if (hex_is_aligned((void *) src, 128)) {
-        hvx_clamp_scalar_f32_ua(dst, src, min, max, num_elems);
-    } else {
-        hvx_clamp_scalar_f32_uu(dst, src, min, max, num_elems);
-    }
-}
-
-#undef HVX_OP_ADD
-#undef HVX_OP_SUB
-#undef HVX_OP_MUL
-#undef hvx_arith_loop_body
-#undef HVX_OP_ADD_SCALAR
-#undef HVX_OP_SUB_SCALAR
-#undef HVX_OP_MUL_SCALAR
-#undef hvx_scalar_loop_body
-#undef HVX_OP_MIN_SCALAR
-#undef HVX_OP_CLAMP_SCALAR
-
-#endif // HVX_ARITH_H
--- a/ggml/src/ggml-hexagon/htp/hvx-base.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-base.h
@@ -1,167 +0,0 @@
-#ifndef HVX_BASE_H
-#define HVX_BASE_H
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "hex-utils.h"
-#include "hvx-types.h"
-
-static inline void hvx_vec_store_u(void * restrict dst, uint32_t n, HVX_Vector v) {
-    // Rotate as needed.
-    v = Q6_V_vlalign_VVR(v, v, (size_t) dst);
-
-    uint32_t left_off  = (size_t) dst & 127;
-    uint32_t right_off = left_off + n;
-
-    HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) dst);
-    HVX_VectorPred qr     = Q6_Q_vsetq2_R(right_off);
-
-    if (right_off > 128) {
-        Q6_vmem_QRIV(qr, (HVX_Vector *) dst + 1, v);
-        // all 1's
-        qr = Q6_Q_vcmp_eq_VbVb(v, v);
-    }
-
-    ql_not = Q6_Q_or_QQn(ql_not, qr);
-    Q6_vmem_QnRIV(ql_not, (HVX_Vector *) dst, v);
-}
-
-static inline void hvx_vec_store_a(void * restrict dst, uint32_t n, HVX_Vector v) {
-    assert((unsigned long) dst % 128 == 0);
-    HVX_VectorPred m = Q6_Q_or_QQn(Q6_Q_vsetq_R((unsigned long) dst), Q6_Q_vsetq2_R(n));
-    Q6_vmem_QnRIV(m, (HVX_Vector *) dst, v);
-}
-
-static inline HVX_Vector hvx_vec_splat_f32(float v) {
-    union { float  f; uint32_t i; } u = { .f = v };
-    return Q6_V_vsplat_R(u.i);
-}
-
-static inline HVX_Vector hvx_vec_splat_f16(float v) {
-    union { __fp16 f; uint16_t i; } u = { .f = v };
-    return Q6_Vh_vsplat_R(u.i);
-}
-
-static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) {
-    // vdelta control to replicate first 4 bytes across all elements
-    static const uint8_t __attribute__((aligned(128))) repl[128] = {
-        0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-        0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-        0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-        0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-        0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
-    };
-
-    HVX_Vector ctrl = *(HVX_Vector *) repl;
-    return Q6_V_vdelta_VV(v, ctrl);
-}
-
-static inline float hvx_vec_get_f32(HVX_Vector v) {
-    float __attribute__((aligned(128))) x;
-    hvx_vec_store_a(&x, 4, v);
-    return x;
-}
-
-static inline HVX_Vector hvx_vec_abs_f16(HVX_Vector v) {
-    // abs by clearing the fp16 sign bit
-    HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff);
-    return Q6_V_vand_VV(v, mask);
-}
-
-static inline HVX_Vector hvx_vec_neg_f16(HVX_Vector v) {
-    // neg by setting the fp16 sign bit
-    HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
-    return Q6_V_vxor_VV(v, mask);
-}
-
-static inline HVX_Vector hvx_vec_abs_f32(HVX_Vector v) {
-    // abs by clearing the fp32 sign bit
-    HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff);
-    return Q6_V_vand_VV(v, mask);
-}
-
-static inline HVX_Vector hvx_vec_neg_f32(HVX_Vector v) {
-#if __HVX_ARCH__ > 75
-    return Q6_Vsf_vfneg_Vsf(v);
-#else
-    // neg by setting the fp32 sign bit
-    HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
-    return Q6_V_vxor_VV(v, mask);
-#endif  // __HVX_ARCH__ > 75
-}
-
-static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
-    const HVX_Vector vnan_exp  = Q6_Vh_vsplat_R(0x7C00);
-    const HVX_Vector vnan_frac = Q6_Vh_vsplat_R(0x7FFF);
-
-    // get pred of which are NaN, i.e., exponent bits all 1s and fraction bits non 0s
-    HVX_VectorPred p_exp  = Q6_Q_vcmp_eq_VhVh(Q6_V_vand_VV(v, vnan_exp), vnan_exp);
-    HVX_VectorPred p_frac = Q6_Q_not_Q(Q6_Q_vcmp_eq_VhVh(Q6_V_vand_VV(v, vnan_frac), vnan_exp));
-    return Q6_Q_and_QQ(p_exp, p_frac);
-}
-
-static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) {
-    const HVX_Vector zero = Q6_V_vsplat_R(0);
-    HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero);
-    HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero);
-    HVX_Vector  v = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0)));
-
-#if __HVX_ARCH__ < 79
-    // replace NaNs with -INF, older arches produce NaNs for (-INF + 0.0)
-    const HVX_Vector neg_inf = hvx_vec_splat_f16(-INFINITY);
-    HVX_VectorPred nan = hvx_vec_is_nan_f16(v);
-    v = Q6_V_vmux_QVV(nan, neg_inf, v);
-#endif
-
-    return v;
-}
-
-/* Q6_Vsf_equals_Vw is only available on v73+.*/
-#if __HVX_ARCH__ < 73
-static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in)
-{
-    HVX_Vector const vzero = Q6_V_vzero();
-    HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
-    HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
-    HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
-    HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
-    HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
-    HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
-    return ret;
-}
-
-static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
-{
-    return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in));
-}
-#endif
-
-static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
-    // This looks complicated.
-    // Ideally should just be Q6_Vh_equals_Vhf(vin)
-    // but that instruction does not do proper rounding.
-
-    // convert to qf32, multiplying by 1.0 in the process.
-    HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00));
-
-    // 'in-range' values are +/32752.
-    // add 192K to it, convert to sf
-    HVX_Vector v192K = Q6_V_vsplat_R(0x48400000);
-    HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K));
-    HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K));
-
-    // for in-range cases, result is {163858... 229360} so the exponent is always 144.
-    // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer.
-    // Start by <<10 to get the final 'sign' bit in bit 15...
-    vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10);
-    vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10);
-
-    // now round down to 16
-    return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0);
-}
-
-#endif /* HVX_BASE_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-copy.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-copy.h
@@ -1,247 +0,0 @@
-#ifndef HVX_COPY_H
-#define HVX_COPY_H
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hvx-base.h"
-
-#define hvx_splat_loop_body(dst_type, vec_store)                 \
-    do {                                                         \
-        dst_type * restrict vdst = (dst_type *) dst;             \
-                                                                 \
-        uint32_t nvec = n / (128 / elem_size);                   \
-        uint32_t nloe = n % (128 / elem_size);                   \
-                                                                 \
-        uint32_t i = 0;                                          \
-                                                                 \
-        _Pragma("unroll(4)")                                     \
-        for (; i < nvec; i++) {                                  \
-            vdst[i] = src;                                       \
-        }                                                        \
-        if (nloe) {                                              \
-            vec_store((void *) &vdst[i], nloe * elem_size, src); \
-        }                                                        \
-    } while(0)
-
-static inline void hvx_splat_a(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
-    assert((unsigned long) dst % 128 == 0);
-    hvx_splat_loop_body(HVX_Vector, hvx_vec_store_a);
-}
-
-static inline void hvx_splat_u(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
-    hvx_splat_loop_body(HVX_UVector, hvx_vec_store_u);
-}
-
-static inline void hvx_splat_f32_a(uint8_t * restrict dst, float v, uint32_t n) {
-    hvx_splat_a(dst,  hvx_vec_splat_f32(v), n, sizeof(float));
-}
-
-static inline void hvx_splat_f32_u(uint8_t * restrict dst, float v, uint32_t n) {
-    hvx_splat_u(dst,  hvx_vec_splat_f32(v), n, sizeof(float));
-}
-
-static inline void hvx_splat_f16_a(uint8_t * restrict dst, float v, uint32_t n) {
-    hvx_splat_u(dst,  hvx_vec_splat_f16(v), n, sizeof(__fp16));
-}
-
-static inline void hvx_splat_f16_u(uint8_t * restrict dst, float v, uint32_t n) {
-    hvx_splat_u(dst,  hvx_vec_splat_f16(v), n, sizeof(__fp16));
-}
-
-#define hvx_copy_loop_body(dst_type, src_type, vec_store)            \
-    do {                                                             \
-        dst_type * restrict vdst = (dst_type *) dst;                 \
-        src_type * restrict vsrc = (src_type *) src;                 \
-                                                                     \
-        const uint32_t epv  = 128 / elem_size;                       \
-        const uint32_t nvec = n / epv;                               \
-        const uint32_t nloe = n % epv;                               \
-                                                                     \
-        uint32_t i = 0;                                              \
-                                                                     \
-        _Pragma("unroll(4)")                                         \
-        for (; i < nvec; i++) { vdst[i] = vsrc[i]; }                 \
-        if (nloe) {                                                  \
-            vec_store((void *) &vdst[i], nloe * elem_size, vsrc[i]); \
-        }                                                            \
-    } while(0)
-
-// Generic copy routines
-static inline void hvx_copy_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_copy_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
-}
-
-static inline void hvx_copy_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
-    assert((unsigned long) dst % 128 == 0);
-    hvx_copy_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
-}
-
-static inline void hvx_copy_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
-    assert((unsigned long) src % 128 == 0);
-    hvx_copy_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
-}
-
-static inline void hvx_copy_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
-    hvx_copy_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
-}
-
-// copy n fp16 elements : source and destination are aligned to HVX Vector (128)
-static inline void hvx_copy_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_aa(dst, src, n, sizeof(__fp16));
-}
-
-// copy n fp16 elements : source is aligned, destination is potentially unaligned
-static inline void hvx_copy_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_au(dst, src, n, sizeof(__fp16));
-}
-
-// copy n fp16 elements : source is aligned, destination is potentially unaligned
-static inline void hvx_copy_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_ua(dst, src, n, sizeof(__fp16));
-}
-
-// copy n fp16 elements : source is aligned, destination is potentially unaligned
-static inline void hvx_copy_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_uu(dst, src, n, sizeof(__fp16));
-}
-
-// copy n fp32 elements : source and destination are aligned to HVX Vector (128)
-static inline void hvx_copy_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_aa(dst, src, n, sizeof(float));
-}
-
-// copy n fp32 elements : source is aligned, destination is unaligned
-static inline void hvx_copy_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_ua(dst, src, n, sizeof(float));
-}
-
-// copy n fp32 elements : source is unaligned, destination is aligned
-static inline void hvx_copy_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_au(dst, src, n, sizeof(float));
-}
-
-// copy n fp32 elements : source is unaligned, destination unaligned
-static inline void hvx_copy_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_uu(dst, src, n, sizeof(float));
-}
-
-//// fp32 -> fp16
-
-#define hvx_copy_f16_f32_loop_body(dst_type, src_type, vec_store)                   \
-    do {                                                                            \
-        dst_type * restrict vdst = (dst_type *) dst;                                \
-        src_type * restrict vsrc = (src_type *) src;                                \
-                                                                                    \
-        const HVX_Vector zero = Q6_V_vsplat_R(0);                                   \
-                                                                                    \
-        const uint32_t elem_size = sizeof(__fp16);                                  \
-        const uint32_t epv  = 128 / elem_size;                                      \
-        const uint32_t nvec = n / epv;                                              \
-        const uint32_t nloe = n % epv;                                              \
-                                                                                    \
-        uint32_t i = 0;                                                             \
-                                                                                    \
-        _Pragma("unroll(4)")                                                        \
-        for (; i < nvec; i++) {                                                     \
-            vdst[i] = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]);                 \
-        }                                                                           \
-        if (nloe) {                                                                 \
-            HVX_Vector v = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]);            \
-            vec_store((void *) &vdst[i], nloe * elem_size, v);                      \
-        }                                                                           \
-    } while(0)
-
-// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is aligned
-static inline void hvx_copy_f16_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
-}
-
-// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned
-static inline void hvx_copy_f16_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
-}
-
-// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned
-static inline void hvx_copy_f16_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) src % 128 == 0);
-    hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
-}
-
-// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned
-static inline void hvx_copy_f16_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
-}
-
-//// fp16 -> fp32
-
-#define hvx_copy_f32_f16_loop_body(dst_type, src_type, vec_store)                   \
-    do {                                                                            \
-        dst_type * restrict vdst = (dst_type *) dst;                                \
-        src_type * restrict vsrc = (src_type *) src;                                \
-                                                                                    \
-        const HVX_Vector one = hvx_vec_splat_f16(1.0);                              \
-                                                                                    \
-        const uint32_t elem_size = sizeof(__fp16);                                  \
-        const uint32_t epv  = 128 / elem_size;                                      \
-        const uint32_t nvec = n / epv;                                              \
-              uint32_t nloe = n % epv;                                              \
-                                                                                    \
-        uint32_t i = 0;                                                             \
-                                                                                    \
-        _Pragma("unroll(4)")                                                        \
-        for (i = 0; i < nvec; ++i) {                                                \
-            HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \
-            vdst[i*2]   = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p));                        \
-            vdst[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p));                        \
-        }                                                                           \
-                                                                                    \
-        if (nloe) {                                                                 \
-            HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \
-                                                                                    \
-            HVX_Vector vd = Q6_V_lo_W(p);                                           \
-            i = 2 * i;                                                              \
-                                                                                    \
-            if (nloe >= 32) {                                                       \
-                vdst[i] = Q6_Vsf_equals_Vqf32(vd);                                  \
-                nloe -= 32; ++i; vd = Q6_V_hi_W(p);                                 \
-            }                                                                       \
-                                                                                    \
-            if (nloe) {                                                             \
-                vd = Q6_Vsf_equals_Vqf32(vd);                                       \
-                hvx_vec_store_u(&vdst[i], nloe * sizeof(float), vd);                \
-            }                                                                       \
-        }                                                                           \
-    } while(0)
-
-// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is aligned
-static inline void hvx_copy_f32_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
-}
-
-// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is aligned
-static inline void hvx_copy_f32_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
-}
-
-// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is unaligned
-static inline void hvx_copy_f32_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) src % 128 == 0);
-    hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
-}
-
-// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is unaligned
-static inline void hvx_copy_f32_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
-}
-
-#endif // HVX_COPY_H
--- a/ggml/src/ggml-hexagon/htp/hvx-dump.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-dump.h
@@ -1,132 +0,0 @@
-#ifndef HVX_DUMP_H
-#define HVX_DUMP_H
-
-#include <HAP_farf.h>
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "hex-utils.h"
-#include "hvx-types.h"
-
-static void hvx_vec_dump_f16_n(char * pref, HVX_Vector v, uint32_t n) {
-    HVX_VectorAlias u = { .v = v };
-
-    const uint32_t n0 = n / 16;
-    const uint32_t n1 = n % 16;
-    int            i  = 0;
-    for (; i < n0; i++) {
-        hex_dump_f16_line(pref, u.fp16 + (16 * i), 16);
-    }
-    if (n1) {
-        hex_dump_f16_line(pref, u.fp16 + (16 * i), n1);
-    }
-}
-
-static void hvx_vec_dump_f16(char * pref, HVX_Vector v) {
-    hvx_vec_dump_f16_n(pref, v, 64);
-}
-
-static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) {
-    union {
-        HVX_Vector v;
-        float      d[32];
-    } u = { .v = v };
-
-    const uint32_t n0 = n / 16;
-    const uint32_t n1 = n % 16;
-    int            i  = 0;
-    for (; i < n0; i++) {
-        hex_dump_f32_line(pref, u.d + (16 * i), 16);
-    }
-    if (n1) {
-        hex_dump_f32_line(pref, u.d + (16 * i), n1);
-    }
-}
-
-static void hvx_vec_dump_f32_hmt(char * pref, HVX_Vector v) {
-    union {
-        HVX_Vector v;
-        float      d[32];
-    } u = { .v = v };
-
-    FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ...  %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1],
-         u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
-}
-
-static void hvx_vec_dump_f32(char * pref, HVX_Vector v) {
-    hvx_vec_dump_f32_n(pref, v, 32);
-}
-
-static void hvx_vec_dump_int32(char * pref, HVX_Vector v) {
-    union {
-        HVX_Vector v;
-        int32_t    d[32];
-    } u = { .v = v };
-
-    for (int i = 0; i < 32 / 16; i++) {
-        hex_dump_int32_line(pref, u.d + (16 * i), 16);
-    }
-}
-
-static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) {
-    union {
-        HVX_Vector v;
-        int32_t    d[32];
-    } u = { .v = v };
-
-    FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12],
-         u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
-}
-
-static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) {
-    union {
-        HVX_Vector v;
-        int8_t     d[128];
-    } u = { .v = v };
-
-    FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60],
-         u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]);
-}
-
-static void hvx_vec_dump_int8(char * pref, HVX_Vector v) {
-    union {
-        HVX_Vector v;
-        int8_t     d[128];
-    } u = { .v = v };
-
-    for (int i = 0; i < 128 / 16; i++) {
-        hex_dump_int8_line(pref, u.d + (16 * i), 16);
-    }
-}
-
-static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) {
-    union {
-        HVX_Vector v;
-        uint8_t    d[128];
-    } u = { .v = v };
-
-    for (int i = 0; i < 128 / 16; i++) {
-        hex_dump_uint8_line(pref, u.d + (16 * i), 16);
-    }
-}
-
-static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) {
-    typedef union {
-        HVX_Vector v;
-        int8_t     d[128];
-    } U;
-
-    U u0 = { .v = v0 };
-    U u1 = { .v = v1 };
-
-    for (int i = 0; i < n; i++) {
-        if (u0.d[i] != u1.d[i]) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-#endif /* HVX_DUMP_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c
@@ -0,0 +1,94 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <math.h>
+#include <string.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-dma.h"
+#include "htp-msg.h"
+#include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"
+
+static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
+    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
+
+    HVX_Vector out = hvx_vec_exp_fp32(in_vec);
+
+    return Q6_V_vmux_QVV(pred0, inf, out);
+}
+
+void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
+    int left_over       = num_elems & (VLEN_FP32 - 1);
+    int num_elems_whole = num_elems - left_over;
+
+    int unaligned_addr = 0;
+    int unaligned_loop = 0;
+    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
+        FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n");
+        unaligned_addr = 1;
+    }
+    // assert((0 == unaligned_addr) || (0 == num_elems_whole));
+    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
+        unaligned_loop = 1;
+        FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n");
+    }
+
+    HVX_Vector vec_out = Q6_V_vzero();
+
+    static const float kInf    = INFINITY;
+    static const float kMaxExp = 88.02f;  // log(INF)
+
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector inf     = hvx_vec_splat_fp32(kInf);
+
+    if (0 == unaligned_loop) {
+        HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
+        HVX_Vector * p_vec_out = (HVX_Vector *) dst;
+
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            if (true == negate) {
+                HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
+                *p_vec_out++          = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
+            } else {
+                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
+            }
+        }
+    } else {
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
+
+            if (true == negate) {
+                HVX_Vector neg_vec_in                    = hvx_vec_neg_fp32(in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
+            } else {
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
+            }
+        }
+    }
+
+    if (left_over > 0) {
+        const float * srcf = (float *) src + num_elems_whole;
+        float *       dstf = (float *) dst + num_elems_whole;
+
+        HVX_Vector in = *(HVX_UVector *) srcf;
+
+        if (true == negate) {
+            HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
+
+            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
+        } else {
+            vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
+        }
+
+        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
+    }
+}
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.h
@@ -1,215 +0,0 @@
-#ifndef HVX_EXP_H
-#define HVX_EXP_H
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "hvx-base.h"
-#include "hvx-floor.h"
-
-#define EXP_COEFF_5 (0x39506967)  // 0.000198757 = 1/(7!)
-#define EXP_COEFF_4 (0x3AB743CE)  // 0.0013982   = 1/(6!)
-#define EXP_COEFF_3 (0x3C088908)  // 0.00833345  = 1/(5!)
-#define EXP_COEFF_2 (0x3D2AA9C1)  // 0.416658    = 1/(4!)
-#define EXP_COEFF_1 (0x3E2AAAAA)  // 0.16666667  = 1/(3!)
-#define EXP_COEFF_0 (0x3F000000)  // 0.5         = 1/(2!)
-#define EXP_LOGN2   (0x3F317218)  // ln(2)   = 0.6931471805
-#define EXP_LOG2E   (0x3FB8AA3B)  // log2(e) = 1/ln(2) = 1.4426950408
-#define EXP_ONE     (0x3f800000)  // 1.0
-#define EXP_RANGE_R (0x41a00000)  // 20.0
-#define EXP_RANGE_L (0xc1a00000)  // -20.0
-
-static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
-    HVX_Vector z_qf32_v;
-    HVX_Vector x_v;
-    HVX_Vector x_qf32_v;
-    HVX_Vector y_v;
-    HVX_Vector k_v;
-    HVX_Vector f_v;
-    HVX_Vector epsilon_v;
-    HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E);
-    HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2);
-    HVX_Vector E_const;
-    HVX_Vector zero_v = Q6_V_vzero();
-
-    // exp(x) is approximated as follows:
-    //   f = floor(x/ln(2)) = floor(x*log2(e))
-    //   epsilon = x - f*ln(2)
-    //   exp(x) = exp(epsilon+f*ln(2))
-    //          = exp(epsilon)*exp(f*ln(2))
-    //          = exp(epsilon)*2^f
-    //
-    //   Since epsilon is close to zero, it can be approximated with its Taylor series:
-    //            exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+...
-    //   Preserving the first eight elements, we get:
-    //            exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7
-    //                   =  1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2
-
-    HVX_Vector temp_v = in_vec;
-
-    // Clamp inputs to (-20.0, 20.0)
-    HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R));
-    HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
-
-    in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v);
-    in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v);
-
-    epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec);
-    epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v);
-
-    //    f_v is the floating point result and k_v is the integer result
-    f_v = hvx_vec_floor_f32(epsilon_v);
-    k_v = hvx_vec_truncate_f32(f_v);
-
-    x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v);
-
-    //  x = x - f_v * logn2;
-    epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2);
-    x_qf32_v  = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v);
-    // normalize before every QFloat's vmpy
-    x_qf32_v  = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v);
-
-    // z = x * x;
-    z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v);
-    z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v);
-
-    x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
-
-    // y = E4 + E5 * x;
-    E_const = Q6_V_vsplat_R(EXP_COEFF_5);
-    y_v     = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v);
-    E_const = Q6_V_vsplat_R(EXP_COEFF_4);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
-
-    // y = E3 + y * x;
-    E_const = Q6_V_vsplat_R(EXP_COEFF_3);
-    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
-
-    // y = E2 + y * x;
-    E_const = Q6_V_vsplat_R(EXP_COEFF_2);
-    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
-
-    // y = E1 + y * x;
-    E_const = Q6_V_vsplat_R(EXP_COEFF_1);
-    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
-
-    // y = E0 + y * x;
-    E_const = Q6_V_vsplat_R(EXP_COEFF_0);
-    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
-    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
-
-    // y = x + y * z;
-    y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v);
-    y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v);
-    y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
-
-    // y = y + 1.0;
-    y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE));
-
-    // insert exponents
-    //        y = ldexpf(y, k);
-    //    y_v += k_v; // qf32
-    // modify exponent
-
-    y_v = Q6_Vsf_equals_Vqf32(y_v);
-
-    // add k_v to the exponent of y_v
-    HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1);
-
-    y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1);
-    y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent);
-
-    // exponent cannot be negative; if overflow is detected, result is set to zero
-    HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent);
-
-    y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN);
-
-    y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v);
-
-    return y_v;
-}
-
-static inline HVX_Vector hvx_vec_exp_f32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
-    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
-
-    HVX_Vector out = hvx_vec_exp_f32(in_vec);
-
-    return Q6_V_vmux_QVV(pred0, inf, out);
-}
-
-static inline void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
-    int left_over       = num_elems & (VLEN_FP32 - 1);
-    int num_elems_whole = num_elems - left_over;
-
-    int unaligned_addr = 0;
-    int unaligned_loop = 0;
-    if ((0 == hex_is_aligned((void *) src, VLEN)) || (0 == hex_is_aligned((void *) dst, VLEN))) {
-        unaligned_addr = 1;
-    }
-    // assert((0 == unaligned_addr) || (0 == num_elems_whole));
-    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
-        unaligned_loop = 1;
-    }
-
-    HVX_Vector vec_out = Q6_V_vzero();
-
-    static const float kInf    = INFINITY;
-    static const float kMaxExp = 88.02f;  // log(INF)
-
-    const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
-    const HVX_Vector inf     = hvx_vec_splat_f32(kInf);
-
-    if (0 == unaligned_loop) {
-        HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
-        HVX_Vector * p_vec_out = (HVX_Vector *) dst;
-
-        #pragma unroll(4)
-        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            if (true == negate) {
-                HVX_Vector neg_vec_in = hvx_vec_neg_f32(*p_vec_in1++);
-                *p_vec_out++          = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf);
-            } else {
-                *p_vec_out++ = hvx_vec_exp_f32_guard(*p_vec_in1++, max_exp, inf);
-            }
-        }
-    } else {
-        #pragma unroll(4)
-        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-
-            if (true == negate) {
-                HVX_Vector neg_vec_in                    = hvx_vec_neg_f32(in);
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf);
-            } else {
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(in, max_exp, inf);
-            }
-        }
-    }
-
-    if (left_over > 0) {
-        const float * srcf = (float *) src + num_elems_whole;
-        float *       dstf = (float *) dst + num_elems_whole;
-
-        HVX_Vector in = *(HVX_UVector *) srcf;
-
-        if (true == negate) {
-            HVX_Vector neg_vec_in = hvx_vec_neg_f32(in);
-
-            vec_out = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf);
-        } else {
-            vec_out = hvx_vec_exp_f32_guard(in, max_exp, inf);
-        }
-
-        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
-    }
-}
-
-#endif /* HVX_EXP_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-floor.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-floor.h
@@ -1,100 +0,0 @@
-#ifndef HVX_FLOOR_H
-#define HVX_FLOOR_H
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "hvx-base.h"
-
-#define IEEE_VSF_EXPLEN   (8)
-#define IEEE_VSF_EXPBIAS  (127)
-#define IEEE_VSF_EXPMASK  (0xFF)
-#define IEEE_VSF_MANTLEN  (23)
-#define IEEE_VSF_MANTMASK (0x7FFFFF)
-#define IEEE_VSF_MIMPMASK (0x800000)
-
-static inline HVX_Vector hvx_vec_truncate_f32(HVX_Vector in_vec) {
-    HVX_Vector mask_mant_v  = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
-    HVX_Vector mask_impl_v  = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
-    HVX_Vector const_zero_v = Q6_V_vzero();
-
-    HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);
-
-    HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
-    expval_v &= IEEE_VSF_EXPMASK;
-    expval_v -= IEEE_VSF_EXPBIAS;
-
-    // negative exp == fractional value
-    HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);
-
-    HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v;         // fractional bits - exp shift
-
-    HVX_Vector mant_v = in_vec & mask_mant_v;                  // obtain mantissa
-    HVX_Vector vout   = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v);  // add implicit 1.0
-
-    vout = Q6_Vw_vasr_VwVw(vout, rshift_v);                    // shift to obtain truncated integer
-    vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout);        // expval<0 -> 0
-
-    HVX_Vector neg_vout = -vout;
-
-    vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout);  // handle negatives
-
-    return (vout);
-}
-
-static inline HVX_Vector hvx_vec_floor_f32(HVX_Vector in_vec) {
-    HVX_Vector mask_mant_v    = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
-    HVX_Vector mask_impl_v    = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
-    HVX_Vector const_mnlen_v  = Q6_V_vsplat_R(IEEE_VSF_MANTLEN);
-    HVX_Vector const_zero_v   = Q6_V_vzero();
-    HVX_Vector const_negone_v = Q6_V_vsplat_R(0xbf800000);  // -1 IEEE vsf
-
-    HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);
-
-    HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
-    expval_v &= IEEE_VSF_EXPMASK;
-    expval_v -= IEEE_VSF_EXPBIAS;
-
-    HVX_VectorPred q_negexp     = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);
-    HVX_VectorPred q_expltmn    = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v);
-    HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v);
-    HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec);
-
-    // if expval < 0 (q_negexp)         // <0, floor is 0
-    //    if vin > 0
-    //       floor = 0
-    //    if vin < 0
-    //       floor = -1
-    // if expval < mant_len (q_expltmn) // >0, but fraction may exist
-    //    get sign (q_negative)
-    //    mask >> expval                // fraction bits to mask off
-    //    vout = ~(mask)                // apply mask to remove fraction
-    //    if (qneg)                     // negative floor is one less (more, sign bit for neg)
-    //      vout += ((impl_mask) >> expval)
-    //    if (mask && vin)
-    //      vout = vin
-    // else                             // already an integer
-    //    ;                             // no change
-
-    // compute floor
-    mask_mant_v >>= expval_v;
-    HVX_Vector neg_addin_v    = mask_impl_v >> expval_v;
-    HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v);
-    HVX_Vector vout           = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec);
-
-    HVX_Vector     mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v);  // chk if bits set
-    HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v);
-
-    HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v);        // frac bits to clear
-    HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v);  // clear frac bits
-
-    vout = in_vec;
-    vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout);         // expval<mant
-    vout = Q6_V_vmux_QVV(q_integral, in_vec, vout);            // integral values
-    vout = Q6_V_vmux_QVV(q_negexp_pos, const_zero_v, vout);    // expval<0 x>0 -> 0
-    vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout);  // expval<0 x<0 -> -1
-
-    return vout;
-}
-
-#endif /* HVX_FLOOR_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
@@ -0,0 +1,72 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <math.h>
+#include <string.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-dma.h"
+#include "htp-msg.h"
+#include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"
+
+static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
+    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
+
+    HVX_Vector           masked_out = Q6_V_vand_VV(out, nan_inf_mask);
+    const HVX_VectorPred pred       = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
+
+    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
+}
+
+void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
+    int left_over       = num_elems & (VLEN_FP32 - 1);
+    int num_elems_whole = num_elems - left_over;
+
+    int unaligned_addr = 0;
+    int unaligned_loop = 0;
+    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
+        FARF(HIGH, "hvx_inverse_f32: unaligned address in hvx op, possibly slower execution\n");
+        unaligned_addr = 1;
+    }
+    // assert((0 == unaligned_addr) || (0 == num_elems_whole));
+    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
+        unaligned_loop = 1;
+        FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
+    }
+
+    static const uint32_t kNanInfMask  = 0x7f800000;
+    const HVX_Vector      nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
+
+    if (0 == unaligned_loop) {
+        HVX_Vector * p_vec_in  = (HVX_Vector *) src;
+        HVX_Vector * p_vec_out = (HVX_Vector *) dst;
+
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
+        }
+    } else {
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            HVX_Vector in                            = *(HVX_UVector *) (src + i * SIZEOF_FP32);
+            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
+        }
+    }
+
+    if (left_over > 0) {
+        const float * srcf = (float *) src + num_elems_whole;
+        float *       dstf = (float *) dst + num_elems_whole;
+
+        HVX_Vector in  = *(HVX_UVector *) srcf;
+        HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
+
+        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
+    }
+}
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.h
@@ -1,176 +0,0 @@
-#ifndef HVX_INVERSE_H
-#define HVX_INVERSE_H
-
-#include <HAP_farf.h>
-
-#include <math.h>
-#include <string.h>
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hvx-base.h"
-
-// ====================================================
-// FUNCTION: 1/(x+1)     y(0) = 1,  y(0.5) = 0.6667, y(1) = 0.5
-// Order:3; continuity: True; Ends forced: True
-// Mode: unsigned;   Result fractional bits: 14
-// Peak Error: 1.1295e-04  Rms Error: 2.8410e-05   Mean Error: 1.1370e-05
-//      32769  -32706   31252  -10589
-//      32590  -30635   22793   -4493
-//      32066  -27505   16481   -2348
-//      31205  -24054   11849   -1306
-
-static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
-    // input is 0..0xffff representing 0.0  .. 1.0
-    HVX_Vector p;
-    p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull);
-    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull);
-    p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull);
-    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull);
-    return p;  // signed result, 14 fractional bits
-}
-
-// Find reciprocal of fp16.
-// (1) first, convert to fp32, multiplying by 1.0; this is done to
-//    handle denormals. Ignoring sign and zero, result should be at
-//    least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000)
-//    (exponent in range [103,143])
-// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly
-// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32
-// (4) convert that to fp16
-// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace
-//     the result with the max value.
-static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
-    HVX_Vector     em_mask  = Q6_Vh_vsplat_R(0x7FFF);
-    HVX_Vector     avals    = Q6_V_vand_VV(vals, em_mask);
-    HVX_VectorPred is_neg   = Q6_Q_vcmp_gt_VhVh(avals, vals);
-    // is too small to 1/x ? for 'standard' fp16, this would be 0x101
-    HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);
-
-    HVX_VectorPair to_qf32  = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00));  // *1.0
-    HVX_Vector     to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
-    HVX_Vector     to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));
-
-    // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector
-    HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
-    // likewise extract the upper 16 from each, containing the exponents in range 103..142
-    HVX_Vector exp_u16  = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
-    //Get exponent in IEEE 32-bit representation
-    exp_u16             = Q6_Vuh_vlsr_VuhR(exp_u16, 7);
-
-    // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane
-    // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0)
-    // Use poly to transform to 1/x, with 14 fractional bits
-    //
-    HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);
-
-    HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm);  //count leading zeros
-
-    // Get mantissa for 16-bit represenation
-    HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));
-
-    //Compute Reciprocal Exponent
-    HVX_Vector exp_recip =
-        Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
-    //Convert it for 16-bit representation
-    exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
-    exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);
-
-    //Merge exponent and mantissa for reciprocal
-    HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
-    // map 'small' inputs to standard largest value 0x7bff
-    recip            = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
-    // add sign back
-    recip            = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
-    return recip;
-}
-
-static inline HVX_Vector hvx_vec_inverse_f32(HVX_Vector v_sf) {
-    HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3);
-    HVX_Vector two_sf       = hvx_vec_splat_f32(2.0);
-
-    // First approximation
-    HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf);
-
-    HVX_Vector r_qf;
-
-    // Refine
-    r_qf = Q6_Vqf32_vmpy_VsfVsf(
-        i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
-    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
-        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
-    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
-        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
-
-    return Q6_Vsf_equals_Vqf32(r_qf);
-}
-
-static inline HVX_Vector hvx_vec_inverse_f32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
-    HVX_Vector out = hvx_vec_inverse_f32(v_sf);
-
-    HVX_Vector     masked_out = Q6_V_vand_VV(out, nan_inf_mask);
-    const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
-
-    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
-}
-
-#define hvx_inverse_f32_loop_body(dst_type, src_type, vec_store)             \
-    do {                                                                     \
-        dst_type * restrict vdst = (dst_type *) dst;                         \
-        src_type * restrict vsrc = (src_type *) src;                         \
-                                                                             \
-        const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000);           \
-                                                                             \
-        const uint32_t nvec = n / VLEN_FP32;                                 \
-        const uint32_t nloe = n % VLEN_FP32;                                 \
-                                                                             \
-        uint32_t i = 0;                                                      \
-                                                                             \
-        _Pragma("unroll(4)")                                                 \
-        for (; i < nvec; i++) {                                              \
-             vdst[i] = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask);     \
-        }                                                                    \
-        if (nloe) {                                                          \
-            HVX_Vector v = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask); \
-            vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, v);             \
-        }                                                                    \
-    } while(0)
-
-static inline void hvx_inverse_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_inverse_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
-}
-
-static inline void hvx_inverse_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    hvx_inverse_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
-}
-
-static inline void hvx_inverse_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) src % 128 == 0);
-    hvx_inverse_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
-}
-
-static inline void hvx_inverse_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_inverse_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
-}
-
-static inline void hvx_inverse_f32(uint8_t * restrict dst, uint8_t * restrict src, const int num_elems) {
-    if ((unsigned long) dst % 128 == 0) {
-        if ((unsigned long) src % 128 == 0) {
-            hvx_inverse_f32_aa(dst, src, num_elems);
-        } else {
-            hvx_inverse_f32_au(dst, src, num_elems);
-        }
-    } else {
-        if ((unsigned long) src % 128 == 0) {
-            hvx_inverse_f32_ua(dst, src, num_elems);
-        } else {
-            hvx_inverse_f32_uu(dst, src, num_elems);
-        }
-    }
-}
-
-#endif // HVX_INVERSE_H
--- a/ggml/src/ggml-hexagon/htp/hvx-reduce.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-reduce.h
@@ -1,225 +0,0 @@
-#ifndef HVX_REDUCE_H
-#define HVX_REDUCE_H
-
-#include <math.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <assert.h>
-
-#include "hex-utils.h"
-#include "hvx-base.h"
-#include "hvx-types.h"
-
-static inline HVX_Vector hvx_vec_reduce_sum_n_i32(HVX_Vector in, unsigned int n) {
-    unsigned int total = n * 4;  // total vec nbytes
-    unsigned int width = 4;      // int32
-
-    HVX_Vector sum = in, sum_t;
-    while (width < total) {
-        sum_t = Q6_V_vror_VR(sum, width);     // rotate right
-        sum   = Q6_Vw_vadd_VwVw(sum_t, sum);  // elementwise sum
-        width = width << 1;
-    }
-    return sum;
-}
-
-static inline HVX_Vector hvx_vec_reduce_sum_i32(HVX_Vector in) {
-    return hvx_vec_reduce_sum_n_i32(in, 32);
-}
-
-static inline HVX_Vector hvx_vec_reduce_sum_n_qf32(HVX_Vector in, unsigned int n) {
-    unsigned int total = n * 4;  // total vec nbytes
-    unsigned int width = 4;      // fp32 nbytes
-
-    HVX_Vector sum = in, sum_t;
-    while (width < total) {
-        sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width);  // rotate right
-        sum   = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t);             // elementwise sum
-        width = width << 1;
-    }
-    return sum;
-}
-
-static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) {
-    return hvx_vec_reduce_sum_n_qf32(in, 32);
-}
-
-static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
-    unsigned int total = n * 4;  // total vec nbytes
-    unsigned int width = 4;      // fp32 nbytes
-
-    HVX_Vector sum = in, sum_t;
-    while (width < total) {
-        sum_t = Q6_V_vror_VR(sum, width);                               // rotate right
-        sum   = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t));  // elementwise sum
-        width = width << 1;
-    }
-    return sum;
-}
-
-static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) {
-    return hvx_vec_reduce_sum_n_f32(in, 32);
-}
-
-static inline HVX_Vector hvx_vec_reduce_max_f16(HVX_Vector in) {
-    unsigned total = 128;  // total vec nbytes
-    unsigned width = 2;    // fp16 nbytes
-
-    HVX_Vector _max = in, _max_t;
-    while (width < total) {
-        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
-        _max   = Q6_Vhf_vmax_VhfVhf(_max_t, _max);  // elementwise max
-        width  = width << 1;
-    }
-
-    return _max;
-}
-
-static inline HVX_Vector hvx_vec_reduce_max2_f16(HVX_Vector in, HVX_Vector _max) {
-    unsigned total = 128;  // total vec nbytes
-    unsigned width = 2;    // fp32 nbytes
-
-    HVX_Vector _max_t;
-
-    _max = Q6_Vhf_vmax_VhfVhf(in, _max);
-    while (width < total) {
-        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
-        _max   = Q6_Vhf_vmax_VhfVhf(_max_t, _max);  // elementwise max
-        width  = width << 1;
-    }
-
-    return _max;
-}
-
-static inline HVX_Vector hvx_vec_reduce_max_f32(HVX_Vector in) {
-    unsigned total = 128;  // total vec nbytes
-    unsigned width = 4;    // fp32 nbytes
-
-    HVX_Vector _max = in, _max_t;
-    while (width < total) {
-        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
-        _max   = Q6_Vsf_vmax_VsfVsf(_max_t, _max);  // elementwise max
-        width  = width << 1;
-    }
-
-    return _max;
-}
-
-static inline HVX_Vector hvx_vec_reduce_max2_f32(HVX_Vector in, HVX_Vector _max) {
-    unsigned total = 128;  // total vec nbytes
-    unsigned width = 4;    // fp32 nbytes
-
-    HVX_Vector _max_t;
-
-    _max = Q6_Vsf_vmax_VsfVsf(in, _max);
-    while (width < total) {
-        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
-        _max   = Q6_Vsf_vmax_VsfVsf(_max_t, _max);  // elementwise max
-        width  = width << 1;
-    }
-
-    return _max;
-}
-
-#define hvx_reduce_loop_body(src_type, init_vec, pad_vec, vec_op, reduce_op, scalar_reduce) \
-    do {                                                                                    \
-        src_type * restrict vsrc = (src_type *) src;                                        \
-        HVX_Vector acc = init_vec;                                                          \
-                                                                                            \
-        const uint32_t elem_size = sizeof(float);                                           \
-        const uint32_t epv  = 128 / elem_size;                                              \
-        const uint32_t nvec = num_elems / epv;                                              \
-        const uint32_t nloe = num_elems % epv;                                              \
-                                                                                            \
-        uint32_t i = 0;                                                                     \
-        _Pragma("unroll(4)")                                                                \
-        for (; i < nvec; i++) {                                                             \
-            acc = vec_op(acc, vsrc[i]);                                                     \
-        }                                                                                   \
-        if (nloe) {                                                                         \
-            const float * srcf = (const float *) src + i * epv;                             \
-            HVX_Vector in = *(HVX_UVector *) srcf;                                          \
-            HVX_Vector temp = Q6_V_valign_VVR(in, pad_vec, nloe * elem_size);               \
-            acc = vec_op(acc, temp);                                                        \
-        }                                                                                   \
-        HVX_Vector v = reduce_op(acc);                                                      \
-        return scalar_reduce(v);                                                            \
-    } while(0)
-
-#define HVX_REDUCE_MAX_OP(acc, val) Q6_Vsf_vmax_VsfVsf(acc, val)
-#define HVX_REDUCE_SUM_OP(acc, val) Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(acc), val)
-#define HVX_SUM_SQ_OP(acc, val) Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(val, val))
-#define HVX_REDUCE_MAX_SCALAR(v) hvx_vec_get_f32(v)
-#define HVX_REDUCE_SUM_SCALAR(v) hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(v))
-
-// Max variants
-
-static inline float hvx_reduce_max_f32_a(const uint8_t * restrict src, const int num_elems) {
-    HVX_Vector init_vec = hvx_vec_splat_f32(((const float *) src)[0]);
-    assert((unsigned long) src % 128 == 0);
-    hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_REDUCE_MAX_OP, hvx_vec_reduce_max_f32, HVX_REDUCE_MAX_SCALAR);
-}
-
-static inline float hvx_reduce_max_f32_u(const uint8_t * restrict src, const int num_elems) {
-    HVX_Vector init_vec = hvx_vec_splat_f32(((const float *) src)[0]);
-    hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_REDUCE_MAX_OP, hvx_vec_reduce_max_f32, HVX_REDUCE_MAX_SCALAR);
-}
-
-static inline float hvx_reduce_max_f32(const uint8_t * restrict src, const int num_elems) {
-    if (hex_is_aligned((void *) src, 128)) {
-        return hvx_reduce_max_f32_a(src, num_elems);
-    } else {
-        return hvx_reduce_max_f32_u(src, num_elems);
-    }
-}
-
-// Sum variants
-
-static inline float hvx_reduce_sum_f32_a(const uint8_t * restrict src, const int num_elems) {
-    HVX_Vector init_vec = Q6_V_vsplat_R(0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_REDUCE_SUM_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR);
-}
-
-static inline float hvx_reduce_sum_f32_u(const uint8_t * restrict src, const int num_elems) {
-    HVX_Vector init_vec = Q6_V_vsplat_R(0);
-    hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_REDUCE_SUM_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR);
-}
-
-static inline float hvx_reduce_sum_f32(const uint8_t * restrict src, const int num_elems) {
-    if (hex_is_aligned((void *) src, 128)) {
-        return hvx_reduce_sum_f32_a(src, num_elems);
-    } else {
-        return hvx_reduce_sum_f32_u(src, num_elems);
-    }
-}
-
-// Sum of squares variants
-
-static inline float hvx_sum_of_squares_f32_a(const uint8_t * restrict src, const int num_elems) {
-    HVX_Vector init_vec = Q6_V_vsplat_R(0);
-    assert((uintptr_t) src % 128 == 0);
-    hvx_reduce_loop_body(HVX_Vector, init_vec, init_vec, HVX_SUM_SQ_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR);
-}
-
-static inline float hvx_sum_of_squares_f32_u(const uint8_t * restrict src, const int num_elems) {
-    HVX_Vector init_vec = Q6_V_vsplat_R(0);
-    hvx_reduce_loop_body(HVX_UVector, init_vec, init_vec, HVX_SUM_SQ_OP, hvx_vec_reduce_sum_qf32, HVX_REDUCE_SUM_SCALAR);
-}
-
-static inline float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) {
-    if (hex_is_aligned((void *) src, 128)) {
-        return hvx_sum_of_squares_f32_a(src, num_elems);
-    } else {
-        return hvx_sum_of_squares_f32_u(src, num_elems);
-    }
-}
-
-#undef hvx_reduce_loop_body
-#undef HVX_REDUCE_MAX_OP
-#undef HVX_REDUCE_SUM_OP
-#undef HVX_REDUCE_MAX_SCALAR
-#undef HVX_REDUCE_SUM_SCALAR
-#undef HVX_SUM_SQ_OP
-
-#endif /* HVX_REDUCE_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-scale.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-scale.h
@@ -1,133 +0,0 @@
-#ifndef HVX_SCALE_H
-#define HVX_SCALE_H
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hvx-base.h"
-
-#define hvx_scale_f32_loop_body(dst_type, src_type, vec_store)                       \
-    do {                                                                             \
-        dst_type * restrict vdst = (dst_type *) dst;                                 \
-        src_type * restrict vsrc = (src_type *) src;                                 \
-                                                                                     \
-        HVX_Vector vs = hvx_vec_splat_f32(scale);                                    \
-                                                                                     \
-        const uint32_t elem_size = sizeof(float);                                    \
-        const uint32_t epv = 128 / elem_size;                                        \
-        const uint32_t nvec = n / epv;                                               \
-        const uint32_t nloe = n % epv;                                               \
-                                                                                     \
-        uint32_t i = 0;                                                              \
-                                                                                     \
-        _Pragma("unroll(4)")                                                         \
-        for (; i < nvec; ++i) {                                                      \
-            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);                        \
-            vdst[i]      = Q6_Vsf_equals_Vqf32(v);                                   \
-        }                                                                            \
-        if (nloe) {                                                                  \
-            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);                        \
-            vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v));  \
-        }                                                                            \
-    } while(0)
-
-static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
-    assert((size_t) dst % 128 == 0);
-    assert((size_t) src % 128 == 0);
-    hvx_scale_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
-}
-
-static inline void hvx_scale_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
-    assert((size_t) dst % 128 == 0);
-    hvx_scale_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
-}
-
-static inline void hvx_scale_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
-    assert((size_t) src % 128 == 0);
-    hvx_scale_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
-}
-
-static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
-    hvx_scale_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
-}
-
-static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
-    if (((size_t) dst & 127) == 0) {
-        if (((size_t) src & 127) == 0) {
-            hvx_scale_f32_aa(dst, src, n, scale);
-        } else {
-            hvx_scale_f32_au(dst, src, n, scale);
-        }
-    } else {
-        if (((size_t) src & 127) == 0) {
-            hvx_scale_f32_ua(dst, src, n, scale);
-        } else {
-            hvx_scale_f32_uu(dst, src, n, scale);
-        }
-    }
-}
-
-#define hvx_scale_offset_f32_loop_body(dst_type, src_type, vec_store)                \
-    do {                                                                             \
-        dst_type * restrict vdst = (dst_type *) dst;                                 \
-        src_type * restrict vsrc = (src_type *) src;                                 \
-                                                                                     \
-        HVX_Vector vs = hvx_vec_splat_f32(scale);                                    \
-        HVX_Vector vo = hvx_vec_splat_f32(offset);                                   \
-                                                                                     \
-        const uint32_t elem_size = sizeof(float);                                    \
-        const uint32_t epv = 128 / elem_size;                                        \
-        const uint32_t nvec = n / epv;                                               \
-        const uint32_t nloe = n % epv;                                               \
-                                                                                     \
-        uint32_t i = 0;                                                              \
-                                                                                     \
-        _Pragma("unroll(4)")                                                         \
-        for (; i < nvec; ++i) {                                                      \
-            HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \
-            vdst[i] = Q6_Vsf_equals_Vqf32(v);                                        \
-        }                                                                            \
-        if (nloe) {                                                                  \
-            HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \
-            vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v));  \
-        }                                                                            \
-    } while(0)
-
-static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
-    assert((size_t) dst % 128 == 0);
-    assert((size_t) src % 128 == 0);
-    hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
-}
-
-static inline void hvx_scale_offset_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
-    assert((size_t) dst % 128 == 0);
-    hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
-}
-
-static inline void hvx_scale_offset_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
-    assert((size_t) src % 128 == 0);
-    hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
-}
-
-static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
-    hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
-}
-
-static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
-    if (((size_t) dst & 127) == 0) {
-        if (((size_t) src & 127) == 0) {
-            hvx_scale_offset_f32_aa(dst, src, n, scale, offset);
-        } else {
-            hvx_scale_offset_f32_au(dst, src, n, scale, offset);
-        }
-    } else {
-        if (((size_t) src & 127) == 0) {
-            hvx_scale_offset_f32_ua(dst, src, n, scale, offset);
-        } else {
-            hvx_scale_offset_f32_uu(dst, src, n, scale, offset);
-        }
-    }
-}
-
-#endif // HVX_SCALE_H
--- a/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c
@@ -0,0 +1,49 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <math.h>
+#include <string.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-dma.h"
+#include "htp-msg.h"
+#include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"
+
+#if 0
+// Reference algo used in hvx-utils
+static void fast_sigmoid_f32(const float*  restrict src, float* restrict dst, const int num_elems)
+{
+    const float c1 = 0.03138777;
+    const float c2 = 0.276281267;
+    const float c_log2f = 1.442695022;
+
+    int32_t store_ints[32];
+    float store_floats[3][32];
+
+    for (int i = 0; i < num_elems; i++)
+    {
+        float v = src0[i];
+
+        v *= c_log2f*0.5;
+        int intPart = (int)v;
+        float x = (v - intPart);
+        float xx = x * x;
+        float v1 = c_log2f + c2 * xx;
+        float v2 = x + xx * c1 * x;
+        float v3 = (v2 + v1);
+        *((int*)&v3) += intPart << 24;
+        float v4 = v2 - v1;
+        float v5 = v3 - v4;
+        float res = v3 / v5;
+
+        dst[i] = res;
+    }
+}
+#endif
--- a/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h
@@ -1,114 +0,0 @@
-#ifndef HVX_SIGMOID_H
-#define HVX_SIGMOID_H
-
-#include "hvx-base.h"
-
-#define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
-#define FAST_SIGMOID_C1    (0x3d009076)  // 0.03138777
-#define FAST_SIGMOID_C2    (0x3e8d74bd)  // 0.276281267
-#define FAST_SIGMOID_C3    (0x3f000000)  // 0.5
-
-static inline HVX_Vector hvx_vec_fast_sigmoid_f32(HVX_Vector v) {
-    v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));
-    v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3));
-
-    HVX_Vector in_int = hvx_vec_truncate_f32(Q6_Vsf_equals_Vqf32(v));
-    HVX_Vector x      = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int));
-    HVX_Vector xx     = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x);
-
-    HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2));
-    v1            = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));
-
-    HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1));
-    v2            = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx);
-    v2            = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x);
-
-    HVX_Vector v3          = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1));
-    HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1);
-    v3_exponent            = Q6_Vuw_vlsr_VuwR(v3_exponent, 24);
-    v3_exponent            = Q6_Vw_vadd_VwVw(in_int, v3_exponent);
-    v3                     = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24);
-
-    HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1));
-    HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4));
-
-    HVX_Vector res = hvx_vec_inverse_f32(v5);
-    res            = Q6_Vqf32_vmpy_VsfVsf(v3, res);
-
-    return Q6_Vsf_equals_Vqf32(res);
-}
-
-static inline HVX_Vector hvx_vec_fast_sigmoid_f32_guard(HVX_Vector v,
-                                                         HVX_Vector one,
-                                                         HVX_Vector max_exp,
-                                                         HVX_Vector min_exp) {
-    const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
-    const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
-
-    HVX_Vector out = hvx_vec_fast_sigmoid_f32(v);
-    out            = Q6_V_vmux_QVV(pred_max, out, one);
-    return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
-}
-
-static inline HVX_Vector hvx_vec_tanh_f32(HVX_Vector x) {
-    // tanh(x) = 2 * sigmoid(2x) - 1
-    HVX_Vector two = hvx_vec_splat_f32(2.0f);
-    HVX_Vector one = hvx_vec_splat_f32(1.0f);
-    HVX_Vector x2  = Q6_Vqf32_vmpy_VsfVsf(x, two);
-
-    HVX_Vector max_exp = hvx_vec_splat_f32(87.f);
-    HVX_Vector min_exp = hvx_vec_splat_f32(-87.f);
-
-    HVX_Vector sig2x = hvx_vec_fast_sigmoid_f32_guard(Q6_Vsf_equals_Vqf32(x2), one, max_exp, min_exp);
-
-    HVX_Vector res = Q6_Vqf32_vmpy_VsfVsf(sig2x, two);
-    res = Q6_Vqf32_vsub_Vqf32Vsf(res, one);
-    return Q6_Vsf_equals_Vqf32(res);
-}
-
-#define hvx_sigmoid_loop_body(dst_type, src_type, vec_store)    \
-    do {                                                        \
-        dst_type * restrict vdst = (dst_type *) dst;            \
-        src_type * restrict vsrc = (src_type *) src;            \
-                                                                \
-        const HVX_Vector one     = hvx_vec_splat_f32(1.f);      \
-        const HVX_Vector max_exp = hvx_vec_splat_f32(87.f);     \
-        const HVX_Vector min_exp = hvx_vec_splat_f32(-87.f);    \
-                                                                \
-        const uint32_t epv  = 128 / sizeof(float);              \
-        const uint32_t nvec = n / epv;                          \
-        const uint32_t nloe = n % epv;                          \
-                                                                \
-        uint32_t i = 0;                                         \
-                                                                \
-        _Pragma("unroll(4)")                                    \
-        for (; i < nvec; i++) {                                 \
-             vdst[i] = hvx_vec_fast_sigmoid_f32_guard(vsrc[i], one, max_exp, min_exp); \
-        }                                                       \
-        if (nloe) {                                             \
-             HVX_Vector tmp = hvx_vec_fast_sigmoid_f32_guard(vsrc[i], one, max_exp, min_exp); \
-             vec_store((void *) &vdst[i], nloe * sizeof(float), tmp); \
-        }                                                       \
-    } while(0)
-
-static inline void hvx_sigmoid_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    assert((unsigned long) src % 128 == 0);
-    hvx_sigmoid_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
-}
-
-static inline void hvx_sigmoid_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) dst % 128 == 0);
-    hvx_sigmoid_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
-}
-
-static inline void hvx_sigmoid_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    assert((unsigned long) src % 128 == 0);
-    hvx_sigmoid_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
-}
-
-static inline void hvx_sigmoid_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
-    hvx_sigmoid_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
-}
-
-#endif /* HVX_SIGMOID_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-sqrt.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-sqrt.h
@@ -1,60 +0,0 @@
-#ifndef HVX_SQRT_H
-#define HVX_SQRT_H
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "hex-utils.h"
-
-#include "hvx-base.h"
-
-#define RSQRT_CONST        0x5f3759df  // Constant for fast inverse square root calculation
-#define RSQRT_ONE_HALF     0x3f000000  // 0.5
-#define RSQRT_THREE_HALVES 0x3fc00000  // 1.5
-
-static inline HVX_Vector hvx_vec_rsqrt_f32(HVX_Vector in_vec) {
-    //Algorithm :
-    //  x2 = input*0.5
-    //  y  = * (long *) &input
-    //  y  = 0x5f3759df - (y>>2)
-    //  y  = y*(threehalfs - x2*y*y)
-
-    HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST);
-    HVX_Vector onehalf    = Q6_V_vsplat_R(RSQRT_ONE_HALF);
-    HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES);
-
-    HVX_Vector x2, y, ypower2, temp;
-
-    x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf);
-    x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero());
-
-    y = Q6_Vw_vasr_VwR(in_vec, 1);
-    y = Q6_Vw_vsub_VwVw(rsqrtconst, y);
-
-    // 1st iteration
-    ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y);
-    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
-    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
-    temp    = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
-    temp    = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp));
-
-    // 2nd iteration
-    y       = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
-    ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
-    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
-    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
-    temp    = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
-    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);
-
-    // 3rd iteration
-    y       = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
-    ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
-    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
-    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
-    temp    = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
-    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);
-
-    return Q6_Vsf_equals_Vqf32(temp);
-}
-
-#endif /* HVX_SQRT_H */
--- a/ggml/src/ggml-hexagon/htp/hvx-types.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-types.h
@@ -1,36 +0,0 @@
-#ifndef HVX_TYPES_H
-#define HVX_TYPES_H
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include <hexagon_types.h>
-
-#define SIZEOF_FP32 (4)
-#define SIZEOF_FP16 (2)
-#define VLEN        (128)
-#define VLEN_FP32   (VLEN / SIZEOF_FP32)
-#define VLEN_FP16   (VLEN / SIZEOF_FP16)
-
-typedef union {
-    HVX_Vector v;
-    uint8_t    b[VLEN];
-    uint16_t   h[VLEN_FP16];
-    uint32_t   w[VLEN_FP32];
-    __fp16     fp16[VLEN_FP16];
-    float      fp32[VLEN_FP32];
-} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
-
-typedef struct {
-    HVX_Vector v[2];
-} HVX_Vector_x2;
-
-typedef struct {
-    HVX_Vector v[4];
-} HVX_Vector_x4;
-
-typedef struct {
-    HVX_Vector v[8];
-} HVX_Vector_x8;
-
-#endif /* HVX_TYPES_H */
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	4a2751258a	server : simplify prompt state transition branches	2026-01-09 17:46:03 +02:00
Georgi Gerganov	cc5cafecf4	fix : nullptr task dereference	2026-01-09 17:32:39 +02:00
Georgi Gerganov	aef22e7afc	cont : reduce parent checks	2026-01-09 16:44:07 +02:00
Georgi Gerganov	9ceb268ee1	cont : remove redundant function	2026-01-09 16:42:29 +02:00
Georgi Gerganov	a4854f0349	cont : improve n_cmpl logic - launch the parent task first so it finds the slot with best cache - parent task waits for child tasks to be launched - when a child task finishes - remove its cache	2026-01-09 15:36:58 +02:00
Georgi Gerganov	f2d988db55	cont : cleanup	2026-01-09 13:22:04 +02:00
Georgi Gerganov	91fd50be1b	Merge branch 'master' into pr/18663	2026-01-09 13:05:16 +02:00
Georgi Gerganov	439c3b5021	cont : init child samplers + modify child logic	2026-01-09 10:55:46 +02:00
Georgi Gerganov	59dda88aae	Merge branch 'master' into HEAD	2026-01-09 09:35:12 +02:00
Xuan Son Nguyen	d7c27d4964	fix infinite loop on empty batch	2026-01-07 14:08:05 +01:00
Xuan Son Nguyen	a9d7bcb7fc	server: fix n_cmpl not skipping processing	2026-01-07 13:13:53 +01:00