ci : disable op offload in some tests

llama : update worst-case graph for unified cache
ggml : add GGML_SCHED_NO_REALLOC option to disable reallocations in ggml_backend_sched
2026-02-12 14:03:20 +02:00 · 2025-11-20 17:16:50 +02:00 · 2025-11-19 09:56:31 +02:00 · 2025-11-17 21:29:22 +01:00
559 changed files with 36556 additions and 143677 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -3,8 +3,7 @@
 # ==============================================================================

 # Define the CANN base image for easier version updates later
-ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

 # ==============================================================================
 # BUILD STAGE
@@ -12,6 +11,9 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS build

+# Define the Ascend chip model for compilation. Default is Ascend910B3
+ARG ASCEND_SOC_TYPE=Ascend910B3
+
 # -- Install build dependencies --
 RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
@@ -34,14 +36,13 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 # For brevity, only core variables are listed here. You can paste the original ENV list here.

 # -- Build llama.cpp --
-# Use the passed CHIP_TYPE argument and add general build options
-ARG CHIP_TYPE
+# Use the passed ASCEND_SOC_TYPE argument and add general build options
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
-        -DSOC_TYPE=ascend${CHIP_TYPE} \
+        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

@@ -111,7 +112,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 ENTRYPOINT [ "/app/llama-cli" ]

--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
-    exec ./llama-completion "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
    exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -34,10 +32,8 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
-    echo "  --run (-r): Run a model (chat) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin"
-    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
    echo "              ex: -m model.gguf"
    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,7 +1,9 @@
-ARG UBUNTU_VERSION=26.04
+ARG UBUNTU_VERSION=25.10

 FROM ubuntu:$UBUNTU_VERSION AS build

+# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
+
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils

@@ -50,7 +52,6 @@ WORKDIR /app

 RUN apt-get update \
    && apt-get install -y \
-    build-essential \
    git \
    python3 \
    python3-pip \
@@ -68,7 +69,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -65,34 +65,3 @@ runs:
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 13.1
-      if: ${{ inputs.cuda_version == '13.1' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -291,7 +291,6 @@ jobs:
                         -DGGML_RVV=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
-                         -DGGML_RV_ZIHINTPAUSE=ON \
                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

--- a/.github/workflows/build-riscv-native.yml
+++ b/.github/workflows/build-riscv-native.yml
@@ -0,0 +1,120 @@
+name: Build on RISCV Linux Machine by Cloud-V
+on:
+  pull_request:
+  workflow_dispatch:
+  workflow_call:
+
+jobs:
+  debian-13-riscv64-native: # Bianbu 2.2
+    runs-on: [self-hosted, RISCV64]
+
+    steps:
+      - name: Install prerequisites
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y libatomic1
+      - uses: actions/checkout@v4
+      - name: Setup Riscv
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-riscv64-linux-gnu \
+                  g++-14-riscv64-linux-gnu \
+                  ccache \
+                  cmake
+
+      - name: Setup ccache
+        run: |
+          mkdir -p $HOME/.ccache
+          ccache -M 5G -d $HOME/.ccache
+          export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
+          export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
+          echo "$GITHUB_WORKSPACE"
+          echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
+          echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
+          echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
+          echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
+
+      - name: Build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_SYSTEM_NAME=Linux \
+            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
+  #   runs-on: [self-hosted, RISCV64]
+
+  #   steps:
+  #     - name: Install prerequisites
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y libatomic1
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu \
+  #                 ccache \
+  #                 cmake
+  #         sudo apt-get upgrade binutils -y
+
+  #     - name: Setup ccache
+  #       run: |
+  #         mkdir -p $HOME/.ccache
+  #         ccache -M 5G -d $HOME/.ccache
+  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
+  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
+  #         echo "$GITHUB_WORKSPACE"
+  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
+  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
+  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
+  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
+
+  #     - name: Build
+  #       run: |
+  #         cmake -B build \
+  #           -DLLAMA_CURL=OFF \
+  #           -DCMAKE_BUILD_TYPE=Release \
+  #           -DGGML_OPENMP=OFF \
+  #           -DLLAMA_BUILD_EXAMPLES=ON \
+  #           -DLLAMA_BUILD_TOOLS=ON \
+  #           -DLLAMA_BUILD_TESTS=OFF \
+  #           -DCMAKE_SYSTEM_NAME=Linux \
+  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
+  #           -DGGML_RVV=ON \
+  #           -DGGML_RV_ZFH=ON \
+  #           -DGGML_RV_ZICBOP=ON \
+  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
+  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
+
+  #         cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -69,6 +69,13 @@ jobs:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
      - name: Build
        id: cmake_build
        run: |
@@ -76,8 +83,6 @@ jobs:
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=OFF \
            -DGGML_METAL_SHADER_DEBUG=ON \
@@ -105,6 +110,13 @@ jobs:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
      - name: Build
        id: cmake_build
        run: |
@@ -114,8 +126,6 @@ jobs:
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_METAL=OFF \
            -DGGML_RPC=ON \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@@ -141,6 +151,13 @@ jobs:
          key: macOS-latest-cmake-arm64-webgpu
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
      - name: Dawn Dependency
        id: dawn-depends
        run: |
@@ -200,7 +217,7 @@ jobs:
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            python3 python3-pip python3-dev \
-            libjpeg-dev build-essential libssl-dev \
+            libjpeg-dev build-essential libcurl4-openssl-dev \
            git-lfs

      - name: Python Dependencies
@@ -221,8 +238,6 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(nproc)
@@ -243,7 +258,7 @@ jobs:
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

      - name: Test llama2c (s390x)
        id: llama2c_test_s390x
@@ -252,7 +267,7 @@ jobs:
          cd build
          echo "Fetch llama2c big-endian model"
          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@@ -279,15 +294,13 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -298,8 +311,6 @@ jobs:
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@@ -324,7 +335,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
@@ -332,8 +343,6 @@ jobs:
          mkdir build
          cd build
          cmake .. \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_LLGUIDANCE=ON
          cmake --build . --config Release -j $(nproc)
@@ -364,14 +373,12 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(nproc)

@@ -398,14 +405,12 @@ jobs:
      - name: Dependencies
        id: depends
        run: |
-          sudo apt-get install -y glslc libvulkan-dev libssl-dev
+          sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev

      - name: Configure
        id: cmake_configure
        run: |
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DGGML_BACKEND_DL=ON \
            -DGGML_CPU_ALL_VARIANTS=ON \
@@ -435,7 +440,7 @@ jobs:
        run: |
          sudo add-apt-repository -y ppa:kisak/kisak-mesa
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev

      - name: Get latest Vulkan SDK version
        id: vulkan_sdk_version
@@ -461,8 +466,6 @@ jobs:
        run: |
          source ./vulkan_sdk/setup-env.sh
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DGGML_VULKAN=ON
          cmake --build build --config Release -j $(nproc)

@@ -494,7 +497,7 @@ jobs:
        run: |
          sudo add-apt-repository -y ppa:kisak/kisak-mesa
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev

      - name: Get latest Vulkan SDK version
        id: vulkan_sdk_version
@@ -534,10 +537,7 @@ jobs:
        id: cmake_build
        run: |
          export Dawn_DIR=dawn/lib64/cmake/Dawn
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DGGML_WEBGPU=ON
+          cmake -B build -DGGML_WEBGPU=ON
          cmake --build build --config Release -j $(nproc)

      - name: Test
@@ -547,46 +547,6 @@ jobs:
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 3600

-  ubuntu-24-wasm-webgpu:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-wasm-webgpu
-          evict-old-files: 1d
-
-      - name: Install Emscripten
-        run: |
-          git clone https://github.com/emscripten-core/emsdk.git
-          cd emsdk
-          ./emsdk install latest
-          ./emsdk activate latest
-
-      - name: Fetch emdawnwebgpu
-        run: |
-          DAWN_TAG="v20251027.212519"
-          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
-          echo "Downloading ${EMDAWN_PKG}"
-          curl -L -o emdawn.zip \
-            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
-          unzip emdawn.zip
-
-      - name: Build WASM WebGPU
-        run: |
-          source emsdk/emsdk_env.sh
-          emcmake cmake -B build-wasm \
-            -DGGML_WEBGPU=ON \
-            -DLLAMA_CURL=OFF \
-            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
-
-          cmake --build build-wasm --target test-backend-ops -j $(nproc)
-
  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
    container: rocm/dev-ubuntu-22.04:6.1.2
@@ -600,7 +560,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
@@ -612,8 +572,6 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build -S . \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
            -DGGML_HIP=ON
@@ -632,7 +590,7 @@ jobs:
        id: depends
        run: |
          apt-get update
-          apt-get install -y build-essential git cmake libssl-dev
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
@@ -644,8 +602,6 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build -S . \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DGGML_MUSA=ON
          cmake --build build --config Release -j $(nproc)

@@ -670,7 +626,7 @@ jobs:
        shell: bash
        run: |
          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev

      - name: install oneAPI MKL library
        shell: bash
@@ -692,8 +648,6 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx
@@ -720,7 +674,7 @@ jobs:
        shell: bash
        run: |
          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev

      - name: install oneAPI MKL library
        shell: bash
@@ -742,8 +696,6 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx \
@@ -770,6 +722,12 @@ jobs:
          key: macOS-latest-cmake-ios
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
      - name: Build
        id: cmake_build
        run: |
@@ -801,6 +759,12 @@ jobs:
          key: macOS-latest-cmake-tvos
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
      - name: Build
        id: cmake_build
        run: |
@@ -826,6 +790,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
      - name: Build
        id: cmake_build
        run: |
@@ -868,6 +838,12 @@ jobs:
          name: llama-xcframework
          path: build-apple/llama.xcframework/

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
      - name: Build llama.cpp with CMake
        id: cmake_build
        run: |
@@ -1019,12 +995,21 @@ jobs:
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
          cmake --build build-arm64-release --target install --config release

+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
+
      - name: Build
        id: cmake_build
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          cmake -S . -B build ${{ matrix.defines }} `
-            -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
+            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

      - name: Add libopenblas.dll
        id: add_libopenblas_dll
@@ -1068,7 +1053,7 @@ jobs:
            DEBIAN_FRONTEND: noninteractive
          run: |
              apt update
-              apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
+              apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev

        - name: ccache
          uses: ggml-org/ccache-action@v1.2.16
@@ -1079,12 +1064,10 @@ jobs:
        - name: Build with CMake
          run: |
            cmake -S . -B build -G Ninja \
-              -DLLAMA_CURL=OFF \
-              -DLLAMA_OPENSSL=ON \
-              -DLLAMA_FATAL_WARNINGS=ON \
              -DCMAKE_BUILD_TYPE=Release \
              -DCMAKE_CUDA_ARCHITECTURES=89-real \
              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+              -DLLAMA_FATAL_WARNINGS=ON \
              -DGGML_NATIVE=OFF \
              -DGGML_CUDA=ON
            cmake --build build
@@ -1118,20 +1101,25 @@ jobs:
        run: |
          choco install ninja

+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+
      - name: Build
        id: cmake_build
        shell: cmd
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DLLAMA_BUILD_SERVER=ON ^
-            -DLLAMA_CURL=OFF ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_CPU_ALL_VARIANTS=ON ^
            -DGGML_CUDA=ON ^
-            -DGGML_RPC=ON
+            -DGGML_RPC=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release
@@ -1163,7 +1151,7 @@ jobs:
        run:  |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+      # TODO: add libcurl support; we will also need to modify win-build-sycl.bat to accept user-specified args

      - name: Build
        id: cmake_build
@@ -1220,8 +1208,14 @@ jobs:
          key: ${{ github.job }}
          evict-old-files: 1d

+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+
      - name: Build
        id: cmake_build
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
@@ -1230,12 +1224,11 @@ jobs:
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
            -DCMAKE_BUILD_TYPE=Release `
-            -DLLAMA_CURL=OFF `
-            -DLLAMA_BUILD_BORINGSSL=ON `
            -DROCM_DIR="${env:HIP_PATH}" `
            -DGGML_HIP=ON `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_RPC=ON
+            -DGGML_RPC=ON `
+            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

  ios-xcode-build:
@@ -1397,57 +1390,32 @@ jobs:
    strategy:
      matrix:
        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
+        cann:
+          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
+        device:
+          - 'ascend910b3'
+        build:
+          - 'Release'
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    container: ascendai/cann:${{ matrix.cann }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
+      - name: Dependencies
        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake libcurl-devel

      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}

-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
+          cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DGGML_CANN=on \
+              -DSOC_TYPE=${{ matrix.device }}
+          cmake --build build -j $(nproc)

 # TODO: simplify the following workflows using a matrix
 # TODO: run lighter CI on PRs and the full CI only on master (if needed)
@@ -1631,33 +1599,33 @@ jobs:
        run: |
          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  # ggml-ci-x64-amd-vulkan:
-  #   runs-on: [self-hosted, Linux, X64, AMD]
+  ggml-ci-x64-amd-vulkan:
+    runs-on: [self-hosted, Linux, X64, AMD]

-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v4
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4

-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  # ggml-ci-x64-amd-rocm:
-  #   runs-on: [self-hosted, Linux, X64, AMD]
+  ggml-ci-x64-amd-rocm:
+    runs-on: [self-hosted, Linux, X64, AMD]

-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v4
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4

-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+      - name: Test
+        id: ggml-ci
+        run: |
+          amd-smi static
+          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]
@@ -1711,337 +1679,6 @@ jobs:
         run: |
           GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

-  ubuntu-cpu-cmake-riscv64-native:
-    runs-on: RISCV64
-
-    steps:
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-
-          # Install necessary packages
-          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
-          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
-
-          # Install Rust stable version
-          rustup install stable
-          rustup default stable
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Check environment
-        run: |
-          uname -a
-          gcc --version
-          g++ --version
-          ldd --version
-          cmake --version
-          rustc --version
-
-      - name: Setup ccache
-        run: |
-          # Set unique cache directory for this job
-          export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
-          mkdir -p "$CCACHE_DIR"
-
-          # Configure ccache for optimal performance
-          ccache --set-config=max_size=5G
-          ccache --set-config=compression=true
-          ccache --set-config=compression_level=6
-          ccache --set-config=cache_dir="$CCACHE_DIR"
-
-          # Enable more aggressive caching
-          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
-          ccache --set-config=hash_dir=false
-
-          # Export for subsequent steps
-          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
-          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=ON \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DGGML_RPC=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L 'main|curl' --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-  ubuntu-cmake-sanitizer-riscv64-native:
-    runs-on: RISCV64
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
-
-    steps:
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-
-          # Install necessary packages
-          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
-          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
-
-          # Install Rust stable version
-          rustup install stable
-          rustup default stable
-
-      - name: GCC version check
-        run: |
-          gcc --version
-          g++ --version
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Setup ccache
-        run: |
-          # Unique cache directory per matrix combination
-          export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
-          mkdir -p "$CCACHE_DIR"
-
-          # Configure ccache
-          ccache --set-config=max_size=5G
-          ccache --set-config=compression=true
-          ccache --set-config=compression_level=6
-          ccache --set-config=cache_dir="$CCACHE_DIR"
-          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
-          ccache --set-config=hash_dir=false
-
-          # Export for subsequent steps
-          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
-          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=ON \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-
-  ubuntu-llguidance-riscv64-native:
-    runs-on: RISCV64
-    steps:
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-
-          # Install necessary packages
-          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
-          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
-
-          # Install Rust stable version
-          rustup install stable
-          rustup default stable
-
-      - name: GCC version check
-        run: |
-          gcc --version
-          g++ --version
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Setup ccache
-        run: |
-          export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
-          mkdir -p "$CCACHE_DIR"
-
-          ccache --set-config=max_size=5G
-          ccache --set-config=compression=true
-          ccache --set-config=compression_level=6
-          ccache --set-config=cache_dir="$CCACHE_DIR"
-          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
-          ccache --set-config=hash_dir=false
-
-          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
-          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DLLAMA_LLGUIDANCE=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-
-  ubuntu-cmake-rpc-riscv64-native:
-    runs-on: RISCV64
-
-    continue-on-error: true
-
-    steps:
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-
-          # Install necessary packages
-          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
-          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
-
-          # Install Rust stable version
-          rustup install stable
-          rustup default stable
-
-      - name: GCC version check
-        run: |
-          gcc --version
-          g++ --version
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Setup ccache
-        run: |
-          export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
-          mkdir -p "$CCACHE_DIR"
-
-          ccache --set-config=max_size=5G
-          ccache --set-config=compression=true
-          ccache --set-config=compression_level=6
-          ccache --set-config=cache_dir="$CCACHE_DIR"
-          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
-          ccache --set-config=hash_dir=false
-
-          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
-          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=ON \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-            -DGGML_RPC=ON
-
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
  ggml-ci-arm64-graviton4-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x

--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -66,21 +66,14 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

-      - name: Upload artifacts (zip)
+      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
          name: llama-bin-macos-arm64.zip

-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
-          name: llama-bin-macos-arm64.tar.gz
-
  macOS-x64:
    runs-on: macos-15-intel

@@ -127,21 +120,14 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

-      - name: Upload artifacts (zip)
+      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
          name: llama-bin-macos-x64.zip

-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
-          name: llama-bin-macos-x64.tar.gz
-
  ubuntu-22-cpu:
    strategy:
      matrix:
@@ -196,21 +182,14 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

-      - name: Upload artifacts (zip)
+      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
          name: llama-bin-ubuntu-${{ matrix.build }}.zip

-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
-          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
-
  ubuntu-22-vulkan:
    runs-on: ubuntu-22.04

@@ -256,21 +235,14 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*

-      - name: Upload artifacts (zip)
+      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
          name: llama-bin-ubuntu-vulkan-x64.zip

-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
-          name: llama-bin-ubuntu-vulkan-x64.tar.gz
-
  windows-cpu:
    runs-on: windows-2025

@@ -326,7 +298,7 @@ jobs:
        run: |
          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
-          7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -408,7 +380,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
+          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -421,7 +393,7 @@ jobs:

    strategy:
      matrix:
-        cuda: ['12.4', '13.1']
+        cuda: ['12.4']

    steps:
      - name: Clone
@@ -462,7 +434,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
+          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -476,7 +448,6 @@ jobs:
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
@@ -546,8 +517,6 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
@@ -557,7 +526,7 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin

          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
+          7z a llama-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload the release package
        uses: actions/upload-artifact@v4
@@ -663,7 +632,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
+          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -716,92 +685,13 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
-          tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework

-      - name: Upload artifacts (zip)
+      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-
-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
-          name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
-
-
-  openEuler-cann:
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+          name: llama-${{ steps.tag.outputs.name }}-xcframework

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -824,7 +714,6 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
-      - openEuler-cann

    steps:
      - name: Clone
@@ -879,7 +768,6 @@ jobs:

          echo "Moving other artifacts..."
          mv -v artifact/*.zip release
-          mv -v artifact/*.tar.gz release

      - name: Create release
        id: create_release
@@ -888,40 +776,6 @@ jobs:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.tag.outputs.name }}
-          body: |
-            > [!WARNING]
-            > **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.
-
-            <details open>
-
-            ${{ github.event.head_commit.message }}
-
-            </details>
-
-            **macOS/iOS:**
-            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
-            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
-            - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
-
-            **Linux:**
-            - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
-            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
-            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
-
-            **Windows:**
-            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
-            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
-            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
-            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip)
-            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
-            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
-            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
-
-            **openEuler:**
-            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
-            - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
-            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
-            - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)

      - name: Upload release
        id: upload_release
@@ -933,7 +787,7 @@ jobs:
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
            for (let file of await fs.readdirSync('./release')) {
-              if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
+              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
                  owner: context.repo.owner,
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -56,7 +56,7 @@ jobs:
            curl \
            wget \
            language-pack-en \
-            libssl-dev
+            libcurl4-openssl-dev

      - name: Clone
        id: checkout
@@ -242,7 +242,7 @@ jobs:
            curl \
            wget \
            language-pack-en \
-            libssl-dev
+            libcurl4-openssl-dev

      - name: Clone
        id: checkout
@@ -283,8 +283,6 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
-              -DLLAMA_CURL=OFF \
-              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@@ -297,8 +295,6 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
-              -DLLAMA_CURL=OFF \
-              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@@ -310,8 +306,6 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
-              -DLLAMA_CURL=OFF \
-              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@@ -351,10 +345,16 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+
      - name: Build
        id: cmake_build
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
+          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
@@ -368,6 +368,13 @@ jobs:
        run: |
          pip install -r tools/server/tests/requirements.txt

+      - name: Copy Libcurl
+        id: prepare_libcurl
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -9,7 +9,6 @@ jobs:
  update:
    name: Update Winget Package
    runs-on: ubuntu-latest
-    if: github.repository_owner == 'ggml-org'

    steps:
      - name: Install cargo binstall
--- a/.gitignore
+++ b/.gitignore
@@ -20,40 +20,52 @@
 *.so
 *.swp
 *.tmp
-*.DS_Store

 # IDE / OS

-/.cache/
-/.ccls-cache/
-/.direnv/
-/.envrc
-/.idea/
-/.swiftpm
-/.vs/
-/.vscode/
-/nppBackup
+.cache/
+.ccls-cache/
+.direnv/
+.DS_Store
+.envrc
+.idea/
+.swiftpm
+.vs/
+.vscode/
+nppBackup


 # Coverage

-/gcovr-report/
-/lcov-report/
+gcovr-report/
+lcov-report/

 # Build Artifacts

-/tags
-/.build/
-/build*
-/release
-/debug
+tags
+.build/
+build*
+release
+debug
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
+!build.zig
+!docs/build.md
 /libllama.so
 /llama-*
 /vulkan-shaders-gen
+android-ndk-*
+arm_neon.h
+cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
 /rpc-server
-/out/
-/tmp/
-/autogen-*.md
+out/
+tmp/
+autogen-*.md

 # Deprecated

@@ -62,38 +74,44 @@

 # CI

-!/.github/workflows/*.yml
+!.github/workflows/*.yml

 # Models

-/models/*
-/models-mnt
-!/models/.editorconfig
-!/models/ggml-vocab-*.gguf*
-!/models/templates
+models/*
+models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*
+!models/templates

 # Zig
-/zig-out/
-/zig-cache/
+zig-out/
+zig-cache/
+
+# Logs
+
+ppl-*.txt
+qnt-*.txt
+perf-*.txt

 # Examples

-/examples/jeopardy/results.txt
-/tools/server/*.css.hpp
-/tools/server/*.html.hpp
-/tools/server/*.js.hpp
-/tools/server/*.mjs.hpp
-/tools/server/*.gz.hpp
-!/build_64.sh
-!/examples/*.bat
-!/examples/*/*.kts
-!/examples/*/*/*.kts
-!/examples/sycl/*.bat
-!/examples/sycl/*.sh
+examples/jeopardy/results.txt
+tools/server/*.css.hpp
+tools/server/*.html.hpp
+tools/server/*.js.hpp
+tools/server/*.mjs.hpp
+tools/server/*.gz.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh

 # Server Web UI temporary files
-/tools/server/webui/node_modules
-/tools/server/webui/dist
+node_modules
+tools/server/webui/dist

 # Python

@@ -129,10 +147,8 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
-/.ccache/
+.ccache/

 # IDE
-/*.code-workspace
-/.windsurf/
-# emscripten
-a.out.*
+*.code-workspace
+.windsurf/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,24 +33,10 @@ endif()

 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

-option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
-
 if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

-    # Use 64-bit memory to support backend_get_memory queries
-    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
-    if (LLAMA_WASM_MEM64)
-      add_compile_options("-sMEMORY64=1")
-      add_link_options("-sMEMORY64=1")
-    endif()
-    add_link_options("-sALLOW_MEMORY_GROWTH=1")
-
-    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
-    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
-    if (LLAMA_BUILD_HTML)
-        set(CMAKE_EXECUTABLE_SUFFIX ".html")
-    endif()
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
 else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -72,12 +58,6 @@ if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

-if (LLAMA_STANDALONE)
-    # enable parallel builds for msbuild
-    list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
-    list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
-endif()
-
 if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
 else()
@@ -199,6 +179,11 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

+if (MINGW)
+    # Set _WIN32_WINNT from GGML_WIN_VER (Windows 8 or newer is needed for PrefetchVirtualMemory)
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
 #
 # build the library
 #
--- a/41
+++ b/41
@@ -2,25 +2,23 @@
 # multiplie collaborators per item can be specified

 /.devops/*.Dockerfile                   @ngxson
-/.github/actions/                       @CISC
+/.github/actions/                       @slaren @CISC
 /.github/workflows/                     @CISC
+/.github/workflows/release.yml          @slaren
+/.github/workflows/winget.yml           @slaren
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/CMakeLists.txt                  @ggerganov
-/common/arg.*                           @ggerganov
+/common/arg.*                           @ggerganov @ericcurtin
 /common/base64.hpp.*                    @ggerganov
 /common/build-info.*                    @ggerganov
-/common/chat.*                          @pwilkin
-/common/chat-peg-parser.*               @aldehir
 /common/common.*                        @ggerganov
 /common/console.*                       @ggerganov
 /common/http.*                          @angt
 /common/llguidance.*                    @ggerganov
 /common/log.*                           @ggerganov
-/common/peg-parser.*                    @aldehir
 /common/sampling.*                      @ggerganov
 /common/speculative.*                   @ggerganov
-/common/unicode.*                       @aldehir
 /convert_*.py                           @CISC
 /examples/batched.swift/                @ggerganov
 /examples/batched/                      @ggerganov
@@ -42,14 +40,21 @@
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
 /examples/save-load-state/              @ggerganov
+/examples/simple-chat/                  @slaren
+/examples/simple/                       @slaren
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
-/ggml/include/                          @ggerganov
-/ggml/src/ggml-common.h                 @ggerganov
-/ggml/src/ggml-cpu/                     @ggerganov
+/ggml/include/                          @ggerganov @slaren
+/ggml/src/ggml-alloc.c                  @slaren
+/ggml/src/ggml-backend*                 @slaren
+/ggml/src/ggml-blas/                    @slaren
+/ggml/src/ggml-common.h                 @ggerganov @slaren
+/ggml/src/ggml-cpu/                     @ggerganov @slaren
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
+/ggml/src/ggml-cuda/common.cuh          @slaren
 /ggml/src/ggml-cuda/fattn*              @JohannesGaessler
+/ggml/src/ggml-cuda/ggml-cuda.cu        @slaren
 /ggml/src/ggml-cuda/mmf.*               @JohannesGaessler @am17an
 /ggml/src/ggml-cuda/mmq.*               @JohannesGaessler
 /ggml/src/ggml-cuda/mmvf.*              @JohannesGaessler
@@ -57,19 +62,19 @@
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
 /ggml/src/ggml-hip/                     @IMbackK
 /ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
-/ggml/src/ggml-impl.h                   @ggerganov
+/ggml/src/ggml-impl.h                   @ggerganov @slaren
 /ggml/src/ggml-metal/                   @ggerganov
 /ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
 /ggml/src/ggml-hexagon/                 @max-krasnyansky @lhez
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @rgerganov
-/ggml/src/ggml-threading.*              @ggerganov
+/ggml/src/ggml-threading.*              @ggerganov @slaren
 /ggml/src/ggml-vulkan/                  @0cc4m
 /ggml/src/ggml-webgpu/                  @reeselevine
 /ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml.c                        @ggerganov
-/ggml/src/ggml.cpp                      @ggerganov
+/ggml/src/ggml.c                        @ggerganov @slaren
+/ggml/src/ggml.cpp                      @ggerganov @slaren
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
 /gguf-py/                               @CISC
 /media/                                 @ggerganov
@@ -81,22 +86,28 @@
 /src/llama-arch.*                       @CISC
 /src/llama-chat.*                       @ngxson
 /src/llama-graph.*                      @CISC
+/src/llama-model-loader.*               @slaren
 /src/llama-model.*                      @CISC
 /src/llama-vocab.*                      @CISC
 /src/models/                            @CISC
 /tests/                                 @ggerganov
-/tests/test-chat-.*                     @pwilkin
+/tests/test-backend-ops.cpp             @slaren
+/tests/test-thread-safety.cpp           @slaren
 /tools/batched-bench/                   @ggerganov
+/tools/llama-bench/                     @slaren
 /tools/main/                            @ggerganov
 /tools/mtmd/                            @ngxson
 /tools/perplexity/                      @ggerganov
 /tools/quantize/                        @ggerganov
 /tools/rpc/                             @rgerganov
-/tools/server/*                         @ngxson @ggerganov # no subdir
+/tools/run/                             @ericcurtin
+/tools/server/*                         @ngxson @ggerganov @ericcurtin # no subdir
 /tools/server/webui/                    @allozaur
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
+/.clang-format                          @slaren
+/.clang-tidy                            @slaren
 /AUTHORS                                @ggerganov
 /CMakeLists.txt                         @ggerganov
 /CONTRIBUTING.md                        @ggerganov
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,12 +15,10 @@ The project differentiates between 3 levels of contributors:
    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.

 # Pull requests (for maintainers)

--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ range of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
+- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
@@ -242,7 +242,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)

 </details>

@@ -276,7 +275,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
-| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
@@ -347,6 +345,19 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

+- <details>
+    <summary>Run simple text completion</summary>
+
+    To disable conversation mode explicitly, use `-no-cnv`
+
+    ```bash
+    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
+
+    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+    ```
+
+    </details>
+
 - <details>
    <summary>Constrain the output with a custom grammar</summary>

@@ -601,4 +612,3 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -65,6 +65,4 @@ However, If you have discovered a security vulnerability in this project, please

 Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).

-Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
-
 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,7 +45,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -398,18 +398,18 @@ function gg_run_qwen3_0_6b {
    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

-    (time ./bin/llama-completion -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -39,10 +39,26 @@ if(Git_FOUND)
    endif()
 endif()

-set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-
-if(CMAKE_VS_PLATFORM_NAME)
-    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+if(MSVC)
+    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+    if (CMAKE_VS_PLATFORM_NAME)
+        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    else()
+        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
 else()
-    set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+    execute_process(
+        COMMAND ${CMAKE_C_COMPILER} --version
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
+    set(BUILD_COMPILER ${OUT})
+
+    execute_process(
+        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    set(BUILD_TARGET ${OUT})
 endif()
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -50,10 +50,6 @@ add_library(${TARGET} STATIC
    base64.hpp
    chat-parser.cpp
    chat-parser.h
-    chat-parser-xml-toolcall.h
-    chat-parser-xml-toolcall.cpp
-    chat-peg-parser.cpp
-    chat-peg-parser.h
    chat.cpp
    chat.h
    common.cpp
@@ -71,18 +67,12 @@ add_library(${TARGET} STATIC
    log.h
    ngram-cache.cpp
    ngram-cache.h
-    peg-parser.cpp
-    peg-parser.h
-    preset.cpp
-    preset.h
    regex-partial.cpp
    regex-partial.h
    sampling.cpp
    sampling.h
    speculative.cpp
    speculative.h
-    unicode.cpp
-    unicode.h
    )

 if (BUILD_SHARED_LIBS)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -3,10 +3,8 @@
 #include "common.h"

 #include <set>
-#include <map>
 #include <string>
 #include <vector>
-#include <cstring>

 //
 // CLI argument parsing
@@ -16,7 +14,6 @@ struct common_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
-    std::vector<const char *> args_neg;  // for negated args like --no-xxx
    const char * value_hint   = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
@@ -26,9 +23,6 @@ struct common_arg {
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
    void (*handler_int)    (common_params & params, int) = nullptr;
-    void (*handler_bool)   (common_params & params, bool) = nullptr;
-
-    common_arg() = default;

    common_arg(
        const std::initializer_list<const char *> & args,
@@ -50,13 +44,6 @@ struct common_arg {
        void (*handler)(common_params & params)
    ) : args(args), help(help), handler_void(handler) {}

-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const std::initializer_list<const char *> & args_neg,
-        const std::string & help,
-        void (*handler)(common_params & params, bool)
-    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
-
    // support 2 values for arg
    common_arg(
        const std::initializer_list<const char *> & args,
@@ -74,33 +61,9 @@ struct common_arg {
    bool is_exclude(enum llama_example ex);
    bool get_value_from_env(std::string & output) const;
    bool has_value_from_env() const;
-    std::string to_string() const;
-
-    // for using as key in std::map
-    bool operator<(const common_arg& other) const {
-        if (args.empty() || other.args.empty()) {
-            return false;
-        }
-        return strcmp(args[0], other.args[0]) < 0;
-    }
-    bool operator==(const common_arg& other) const {
-        if (args.empty() || other.args.empty()) {
-            return false;
-        }
-        return strcmp(args[0], other.args[0]) == 0;
-    }
-
-    // get all args and env vars (including negated args/env)
-    std::vector<std::string> get_args() const;
-    std::vector<std::string> get_env() const;
+    std::string to_string();
 };

-namespace common_arg_utils {
-    bool is_truthy(const std::string & value);
-    bool is_falsey(const std::string & value);
-    bool is_autoy(const std::string & value);
-}
-
 struct common_params_context {
    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
    common_params & params;
@@ -113,11 +76,7 @@ struct common_params_context {
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

-// parse input arguments from CLI into a map
-// TODO: support repeated args in the future
-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
-
-// initialize argument parser context - used by test-arg-parser and preset
+// initialize the argument parser context; used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

 struct common_remote_params {
--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@@ -1,879 +0,0 @@
-#include "chat.h"
-#include "chat-parser.h"
-#include "common.h"
-#include "json-partial.h"
-#include "json-schema-to-grammar.h"
-#include "log.h"
-#include "regex-partial.h"
-
-using json = nlohmann::ordered_json;
-
-class xml_toolcall_syntax_exception : public std::runtime_error {
-  public:
-    xml_toolcall_syntax_exception(const std::string & message) : std::runtime_error(message) {}
-};
-
-template<typename T>
-inline void sort_uniq(std::vector<T> &vec) {
-    std::sort(vec.begin(), vec.end());
-    vec.erase(std::unique(vec.begin(), vec.end()), vec.end());
-}
-
-template<typename T>
-inline bool all_space(const T &str) {
-    return std::all_of(str.begin(), str.end(), [](unsigned char ch) { return std::isspace(ch); });
-}
-
-static size_t utf8_truncate_safe(const std::string_view s) {
-    size_t len = s.size();
-    if (len == 0) return 0;
-    size_t i = len;
-    for (size_t back = 0; back < 4 && i > 0; ++back) {
-        --i;
-        unsigned char c = s[i];
-        if ((c & 0x80) == 0) {
-            return len;
-        } else if ((c & 0xC0) == 0xC0) {
-            size_t expected_len = 0;
-            if ((c & 0xE0) == 0xC0) expected_len = 2;
-            else if ((c & 0xF0) == 0xE0) expected_len = 3;
-            else if ((c & 0xF8) == 0xF0) expected_len = 4;
-            else return i;
-            if (len - i >= expected_len) {
-                return len;
-            } else {
-                return i;
-            }
-        }
-    }
-    return len - std::min(len, size_t(3));
-}
-
-inline void utf8_truncate_safe_resize(std::string &s) {
-    s.resize(utf8_truncate_safe(s));
-}
-
-inline std::string_view utf8_truncate_safe_view(const std::string_view s) {
-    return s.substr(0, utf8_truncate_safe(s));
-}
-
-static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) {
-    if (literal1.size() == 0) return builder.try_find_literal(literal2);
-    const auto saved_pos = builder.pos();
-    while (auto res = builder.try_find_literal(literal1)) {
-        builder.consume_spaces();
-        const auto match_len = std::min(literal2.size(), builder.input().size() - builder.pos());
-        if (builder.input().compare(builder.pos(), match_len, literal2, 0, match_len) == 0) {
-            if (res->prelude.size() != res->groups[0].begin - saved_pos) {
-                res->prelude = builder.str({saved_pos, res->groups[0].begin});
-            }
-            builder.move_to(builder.pos() + match_len);
-            res->groups[0].end = builder.pos();
-            GGML_ASSERT(res->groups[0].begin != res->groups[0].end);
-            return res;
-        }
-        builder.move_to(res->groups[0].begin + 1);
-    }
-    builder.move_to(saved_pos);
-    return std::nullopt;
-}
-
-/**
- * make a GBNF that accept any strings except those containing any of the forbidden strings.
- */
-std::string make_gbnf_excluding(std::vector<std::string> forbids) {
-    constexpr auto charclass_escape = [](unsigned char c) -> std::string {
-        if (c == '\\' || c == ']' || c == '^' || c == '-') {
-            std::string s = "\\";
-            s.push_back((char)c);
-            return s;
-        }
-        if (isprint(c)) {
-            return std::string(1, (char)c);
-        }
-        char buf[16];
-        snprintf(buf, 15, "\\x%02X", c);
-        return std::string(buf);
-    };
-    constexpr auto build_expr = [charclass_escape](auto self, const std::vector<std::string>& forbids, int l, int r, int depth) -> std::string {
-        std::vector<std::pair<unsigned char, std::pair<int,int>>> children;
-        int i = l;
-        while (i < r) {
-            const std::string &s = forbids[i];
-            if ((int)s.size() == depth) {
-                ++i;
-                continue;
-            }
-            unsigned char c = (unsigned char)s[depth];
-            int j = i;
-            while (j < r && (int)forbids[j].size() > depth &&
-                   (unsigned char)forbids[j][depth] == c) {
-                ++j;
-            }
-            children.push_back({c, {i, j}});
-            i = j;
-        }
-        std::vector<std::string> alts;
-        if (!children.empty()) {
-            std::string cls;
-            for (auto &ch : children) cls += charclass_escape(ch.first);
-            alts.push_back(std::string("[^") + cls + "]");
-        }
-        for (auto &ch : children) {
-            std::string childExpr = self(self, forbids, ch.second.first, ch.second.second, depth+1);
-            if (!childExpr.empty()) {
-                std::string quoted_ch = "\"";
-                if (ch.first == '\\') quoted_ch += "\\\\";
-                else if (ch.first == '"') quoted_ch += "\\\"";
-                else if (isprint(ch.first)) quoted_ch.push_back(ch.first);
-                else {
-                    char buf[16];
-                    snprintf(buf, 15, "\\x%02X", ch.first);
-                    quoted_ch += buf;
-                }
-                quoted_ch += "\"";
-                std::string branch = quoted_ch + std::string(" ") + childExpr;
-                alts.push_back(branch);
-            }
-        }
-        if (alts.empty()) return "";
-        std::ostringstream oss;
-        oss << "( ";
-        for (size_t k = 0; k < alts.size(); ++k) {
-            if (k) oss << " | ";
-            oss << alts[k];
-        }
-        oss << " )";
-        return oss.str();
-    };
-    if (forbids.empty()) return "( . )*";
-    sort(forbids.begin(), forbids.end());
-    std::string expr = build_expr(build_expr, forbids, 0, forbids.size(), 0);
-    if (expr.empty()) {
-        std::string cls;
-        for (auto &s : forbids) if (!s.empty()) cls += charclass_escape((unsigned char)s[0]);
-        expr = std::string("( [^") + cls + "] )";
-    }
-    if (forbids.size() == 1)
-        return expr + "*";
-    else
-        return std::string("( ") + expr + " )*";
-}
-
-/**
- * Build grammar for xml-style tool call
- * form.scope_start and form.scope_end can be empty.
- * Requires data.format for model-specific hacks.
- */
-void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
-    GGML_ASSERT(!form.tool_start.empty());
-    GGML_ASSERT(!form.tool_sep.empty());
-    GGML_ASSERT(!form.key_start.empty());
-    GGML_ASSERT(!form.val_end.empty());
-    GGML_ASSERT(!form.tool_end.empty());
-
-    std::string key_val_sep = form.key_val_sep;
-    if (form.key_val_sep2) {
-        key_val_sep += "\n";
-        key_val_sep += *form.key_val_sep2;
-    }
-    GGML_ASSERT(!key_val_sep.empty());
-
-    if (tools.is_array() && !tools.empty()) {
-        data.grammar = build_grammar([&](const common_grammar_builder &builder) {
-            auto string_arg_val = form.last_val_end ?
-                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
-                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
-
-            std::vector<std::string> tool_rules;
-            for (const auto & tool : tools) {
-                if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
-                    LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
-                    continue;
-                }
-                const auto & function = tool.at("function");
-                if (!function.contains("name") || !function.at("name").is_string()) {
-                    LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
-                    continue;
-                }
-                if (!function.contains("parameters") || !function.at("parameters").is_object()) {
-                    LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
-                    continue;
-                }
-                std::string name = function.at("name");
-                auto parameters = function.at("parameters");
-                builder.resolve_refs(parameters);
-
-                struct parameter_rule {
-                    std::string symbol_name;
-                    bool is_required;
-                };
-                std::vector<parameter_rule> arg_rules;
-                if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
-                    LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
-                    continue;
-                } else {
-                    std::vector<std::string> requiredParameters;
-                    if (parameters.contains("required")) {
-                        try { parameters.at("required").get_to(requiredParameters); }
-                        catch (const std::runtime_error&) {
-                            LOG_WRN("Invalid function required parameters, ignoring: %s", function.at("required").dump(2).c_str());
-                        }
-                    }
-                    sort_uniq(requiredParameters);
-                    for (const auto & [key, value] : parameters.at("properties").items()) {
-                        std::string quoted_key = key;
-                        bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
-                        if (form.key_start.back() == '"' && key_val_sep[0] == '"') {
-                            quoted_key = gbnf_format_literal(key);
-                            quoted_key = quoted_key.substr(1, quoted_key.size() - 2);
-                        }
-                        arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
-                            gbnf_format_literal(form.key_start) + " " +
-                            gbnf_format_literal(quoted_key) + " " +
-                            gbnf_format_literal(key_val_sep) + " " +
-                            ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
-                                    (form.raw_argval ?
-                                            string_arg_val :
-                                            "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
-                                    ) :
-                                    builder.add_schema(name + "-arg-" + key, value)
-                            )
-                        ), required});
-                    }
-                }
-
-                auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
-                decltype(next_arg_with_sep) next_arg = "\"\"";
-                for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) {
-                    std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
-                    next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
-                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg
-                    );
-                    include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
-                    next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
-                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
-                    );
-                }
-
-                std::string quoted_name = name;
-                if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
-                    quoted_name = gbnf_format_literal(name);
-                    quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
-                }
-                quoted_name = gbnf_format_literal(quoted_name);
-                // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
-                if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
-                    quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
-                }
-                tool_rules.push_back(builder.add_rule(name + "-call",
-                        gbnf_format_literal(form.tool_start) + " " +
-                        quoted_name + " " +
-                        gbnf_format_literal(form.tool_sep) + " " +
-                        next_arg
-                ));
-            }
-
-            auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
-            auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
-            auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
-            auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
-            builder.add_rule("root",
-                (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
-                tool_call_multiple_with_end  + "?" +
-                (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
-            );
-        });
-
-        // grammar trigger for tool call
-        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
-    }
-}
-
-/**
- * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
- * Throws xml_toolcall_syntax_exception if there is invalid syntax and cannot recover the original status for common_chat_msg_parser.
- * form.scope_start, form.tool_sep and form.scope_end can be empty.
- */
-inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
-    GGML_ASSERT(!form.tool_start.empty());
-    GGML_ASSERT(!form.key_start.empty());
-    GGML_ASSERT(!form.key_val_sep.empty());
-    GGML_ASSERT(!form.val_end.empty());
-    GGML_ASSERT(!form.tool_end.empty());
-
-    // Helper to choose return false or throw error
-    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
-        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
-        if (recovery) {
-            builder.move_to(start_pos);
-            return false;
-        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
-    };
-    // Drop substring from needle to end from a JSON
-    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
-        auto pos = json_str.rfind(needle);
-        if (pos == std::string::npos) {
-            return false;
-        }
-        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
-            unsigned char ch = static_cast<unsigned char>(json_str[i]);
-            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
-                return false;
-            }
-        }
-        if (pos != 0 && json_str[pos - 1] == '"') {
-            --pos;
-        }
-        json_str.resize(pos);
-        return true;
-    };
-    // Helper to generate a partial argument JSON
-    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
-        auto rest = builder.consume_rest();
-        utf8_truncate_safe_resize(rest);
-        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
-        auto tool_str = arguments.dump();
-        if (partial_json(tool_str)) {
-            if (builder.add_tool_call(function_name, "", tool_str)) {
-                return;
-            }
-        }
-        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
-    };
-    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end)
-    constexpr auto try_find_close = [](
-            common_chat_msg_parser & builder,
-            const std::string & end,
-            const std::optional<std::string> & alt_end,
-            const std::string & end_next,
-            const std::optional<std::string> & alt_end_next
-    ) {
-        auto saved_pos = builder.pos();
-        auto tc = builder.try_find_literal(end);
-        auto val_end_size = end.size();
-        if (alt_end) {
-            auto pos_1 = builder.pos();
-            builder.move_to(saved_pos);
-            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
-            if (alt_end_next) {
-                builder.move_to(saved_pos);
-                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
-                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
-                    tc2 = tc3;
-                }
-            }
-            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
-                tc = tc2;
-                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size());
-                builder.move_to(tc->groups[0].end);
-                val_end_size = alt_end->size();
-            } else {
-                builder.move_to(pos_1);
-            }
-        }
-        return std::make_pair(val_end_size, tc);
-    };
-    // Helper to find a val_end or last_val_end, returns matched pattern size
-    const auto try_find_val_end = [try_find_close, &builder, &form]() {
-        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
-    };
-    // Helper to find a tool_end or last_tool_end, returns matched pattern size
-    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
-        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
-    };
-
-    bool recovery = true;
-    const auto start_pos = builder.pos();
-    if (!all_space(form.scope_start)) {
-        if (auto tc = builder.try_find_literal(form.scope_start)) {
-            if (all_space(tc->prelude)) {
-                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
-                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
-            } else {
-                builder.move_to(start_pos);
-                return false;
-            }
-        } else return false;
-    }
-    while (auto tc = builder.try_find_literal(form.tool_start)) {
-        if (!all_space(tc->prelude)) {
-            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
-                    gbnf_format_literal(form.tool_start).c_str(),
-                    gbnf_format_literal(tc->prelude).c_str()
-            );
-            builder.move_to(tc->groups[0].begin - tc->prelude.size());
-            break;
-        }
-
-        // Find tool name
-        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
-        if (!func_name) {
-            auto [sz, tc] = try_find_tool_end();
-            func_name = tc;
-        }
-        if (!func_name) {
-            // Partial tool name not supported
-            throw common_chat_msg_partial_exception("incomplete tool_call");
-        }
-        // If the model generate multiple tool call and the first tool call has no argument
-        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
-            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
-            auto [sz, tc] = try_find_tool_end();
-            func_name = tc;
-        }
-
-        // Parse tool name
-        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
-        std::string function_name = string_strip(func_name->prelude);
-        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
-        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
-            if (string_starts_with(function_name, "functions.")) {
-                static const std::regex re(":\\d+$");
-                if (std::regex_search(function_name, re)) {
-                    function_name = function_name.substr(10, function_name.rfind(":") - 10);
-                }
-            }
-        }
-
-        // Argument JSON
-        json arguments = json::object();
-
-        // Helper to generate a partial argument JSON
-        const auto gen_partial_args = [&](auto set_partial_arg) {
-            gen_partial_json(set_partial_arg, arguments, builder, function_name);
-        };
-
-        // Parse all arg_key/arg_value pairs
-        while (auto tc = builder.try_find_literal(form.key_start)) {
-            if (!all_space(tc->prelude)) {
-                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
-                        gbnf_format_literal(form.key_start).c_str(),
-                        gbnf_format_literal(tc->prelude).c_str()
-                );
-                builder.move_to(tc->groups[0].begin - tc->prelude.size());
-                break;
-            }
-            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
-                auto tool_call_arg = arguments.dump();
-                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
-                    tool_call_arg.resize(tool_call_arg.size() - 1);
-                }
-                builder.add_tool_call(function_name, "", tool_call_arg);
-                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
-            }
-
-            // Parse arg_key
-            auto key_res = builder.try_find_literal(form.key_val_sep);
-            if (!key_res) {
-                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
-                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
-            }
-            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
-                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
-                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
-            }
-            auto &key = key_res->prelude;
-            recovery = false;
-
-            // Parse arg_value
-            if (form.key_val_sep2) {
-                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
-                    if (!all_space(tc->prelude)) {
-                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
-                                gbnf_format_literal(tc->prelude).c_str(),
-                                gbnf_format_literal(form.key_val_sep).c_str(),
-                                gbnf_format_literal(*form.key_val_sep2).c_str()
-                        );
-                        return return_error(builder, start_pos, false);
-                    }
-                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
-                    }
-                } else {
-                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
-                }
-            }
-            auto val_start = builder.pos();
-
-            // Test if arg_val is a partial JSON
-            std::optional<common_json> value_json = std::nullopt;
-            if (!form.raw_argval || !*form.raw_argval) {
-                try { value_json = builder.try_consume_json(); }
-                catch (const std::runtime_error&) { builder.move_to(val_start); }
-                // TODO: Delete this when json_partial adds top-level support for null/true/false
-                if (builder.pos() == val_start) {
-                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
-                    builder.consume_spaces();
-                    std::string_view sv = utf8_truncate_safe_view(builder.input());
-                    sv.remove_prefix(builder.pos());
-                    std::string rest = "a";
-                    if (sv.size() < 6) rest = sv;
-                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
-                        value_json = {123, {"123", "123"}};
-                        builder.consume_rest();
-                    } else {
-                        builder.move_to(val_start);
-                    }
-                }
-            }
-
-            // If it is a JSON and followed by </arg_value>, parse as json
-            // cannot support streaming because it may be a plain text starting with JSON
-            if (value_json) {
-                auto json_end = builder.pos();
-                builder.consume_spaces();
-                if (builder.pos() == builder.input().size()) {
-                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
-                        arguments[key] = value_json->json;
-                        auto json_str = arguments.dump();
-                        if (!value_json->healing_marker.json_dump_marker.empty()) {
-                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
-                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
-                        } else {
-                            GGML_ASSERT(json_str.back() == '}');
-                            json_str.resize(json_str.size() - 1);
-                        }
-                        builder.add_tool_call(function_name, "", json_str);
-                    } else {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                    }
-                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
-                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
-                }
-                builder.move_to(json_end);
-                auto [val_end_size, tc] = try_find_val_end();
-                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
-                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
-                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
-                    } else arguments[key] = value_json->json;
-                } else builder.move_to(val_start);
-            }
-
-            // If not, parse as plain text
-            if (val_start == builder.pos()) {
-                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
-                    auto &value_str = value_plain->prelude;
-                    if (form.trim_raw_argval) value_str = string_strip(value_str);
-                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
-                        throw common_chat_msg_partial_exception(
-                                "Expected " + gbnf_format_literal(form.val_end) +
-                                " after " + gbnf_format_literal(form.key_val_sep) +
-                                (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
-                        );
-                    }
-                    arguments[key] = value_str;
-                } else {
-                    if (form.trim_raw_argval) {
-                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
-                    } else {
-                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
-                    }
-                    throw common_chat_msg_partial_exception(
-                            "Expected " + gbnf_format_literal(form.val_end) +
-                            " after " + gbnf_format_literal(form.key_val_sep) +
-                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
-                    );
-                }
-            }
-        }
-
-        // Consume closing tag
-        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
-            if (!all_space(tc->prelude)) {
-                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
-                        gbnf_format_literal(form.tool_end).c_str(),
-                        gbnf_format_literal(tc->prelude).c_str()
-                );
-                return return_error(builder, start_pos, recovery);
-            }
-            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
-                // Add the parsed tool call
-                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
-                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
-                }
-                recovery = false;
-                continue;
-            }
-        }
-
-        auto tool_call_arg = arguments.dump();
-        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
-            tool_call_arg.resize(tool_call_arg.size() - 1);
-        }
-        builder.add_tool_call(function_name, "", tool_call_arg);
-        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
-    }
-    if (auto tc = builder.try_find_literal(form.scope_end)) {
-        if (!all_space(tc->prelude)) {
-            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
-                    gbnf_format_literal(form.scope_end).c_str(),
-                    gbnf_format_literal(tc->prelude).c_str()
-            );
-            return return_error(builder, start_pos, recovery);
-        }
-    } else {
-        if (all_space(form.scope_end)) return true;
-        builder.consume_spaces();
-        if (builder.pos() == builder.input().size())
-            throw common_chat_msg_partial_exception("incomplete tool calls");
-        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
-                gbnf_format_literal(form.scope_end).c_str(),
-                gbnf_format_literal(builder.consume_rest()).c_str()
-        );
-        return return_error(builder, start_pos, recovery);
-    }
-
-    return true;
-}
-
-/**
- * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
- * May cause std::runtime_error if there is invalid syntax because partial valid tool call is already sent out to client.
- * form.scope_start, form.tool_sep and form.scope_end can be empty.
- */
-bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
-    auto pos = pos_;
-    auto tsize = result_.tool_calls.size();
-    try { return parse_xml_tool_calls(*this, form); }
-    catch (const xml_toolcall_syntax_exception&) {}
-    move_to(pos);
-    result_.tool_calls.resize(tsize);
-    return false;
-}
-
-/**
- * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
- */
-inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
-    constexpr auto rstrip = [](std::string &s) {
-        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
-    };
-    // Erase substring from l to r, along with additional spaces nearby
-    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
-        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l])));
-        ++l;
-        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r])));
-        if (l < r) str[l] = '\n';
-        if (l + 1 < r) str[l + 1] = '\n';
-        if (l != 0) l += 2;
-        str.erase(l, r - l);
-        return l;
-    };
-    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
-        auto best_match = content.size();
-        for (auto pattern: list) {
-            if (pattern.size() == 0) continue;
-            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
-                auto match_len = content.size() - match_idx;
-                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
-                    best_match = match_idx;
-                }
-            }
-        }
-        if (content.size() > best_match) {
-            content.erase(best_match);
-        }
-    };
-    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
-        return trim_suffix(content, {
-            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
-            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
-            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
-            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
-            form.scope_end
-        });
-    };
-
-
-    // Trim leading spaces without affecting keyword matching
-    static const common_regex spaces_regex("\\s*");
-    {
-        auto tc = builder.consume_regex(spaces_regex);
-        auto spaces = builder.str(tc.groups[0]);
-        auto s1 = spaces.size();
-        trim_potential_partial_word(spaces);
-        auto s2 = spaces.size();
-        builder.move_to(builder.pos() - (s1 - s2));
-    }
-
-    // Parse content
-    bool reasoning_unclosed = builder.syntax().thinking_forced_open;
-    std::string unclosed_reasoning_content("");
-    for (;;) {
-        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
-        std::string content;
-        std::string tool_call_start;
-
-        if (tc) {
-            content = std::move(tc->prelude);
-            tool_call_start = builder.str(tc->groups[0]);
-            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
-        } else {
-            content = builder.consume_rest();
-            utf8_truncate_safe_resize(content);
-        }
-
-        // Handle unclosed think block
-        if (reasoning_unclosed) {
-            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
-                unclosed_reasoning_content += content;
-                if (!(form.allow_toolcall_in_think && tc)) {
-                    unclosed_reasoning_content += tool_call_start;
-                    continue;
-                }
-            } else {
-                reasoning_unclosed = false;
-                std::string reasoning_content;
-                if (pos == std::string::npos) {
-                    reasoning_content = std::move(content);
-                } else {
-                    reasoning_content = content.substr(0, pos);
-                    content.erase(0, pos + end_think.size());
-                }
-                if (builder.pos() == builder.input().size() && all_space(content)) {
-                    rstrip(reasoning_content);
-                    trim_potential_partial_word(reasoning_content);
-                    rstrip(reasoning_content);
-                    if (reasoning_content.empty()) {
-                        rstrip(unclosed_reasoning_content);
-                        trim_potential_partial_word(unclosed_reasoning_content);
-                        rstrip(unclosed_reasoning_content);
-                        if (unclosed_reasoning_content.empty()) continue;
-                    }
-                }
-                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
-                    builder.add_content(start_think);
-                    builder.add_content(unclosed_reasoning_content);
-                    builder.add_content(reasoning_content);
-                    if (builder.pos() != builder.input().size() || !all_space(content))
-                        builder.add_content(end_think);
-                } else {
-                    builder.add_reasoning_content(unclosed_reasoning_content);
-                    builder.add_reasoning_content(reasoning_content);
-                }
-                unclosed_reasoning_content.clear();
-            }
-        }
-
-        // Handle multiple think block
-        bool toolcall_in_think = false;
-        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
-            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
-                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
-                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
-                    builder.add_reasoning_content(reasoning_content);
-                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
-                } else {
-                    think_start = think_end + end_think.size() - 1;
-                }
-            } else {
-                // This <tool_call> start is in thinking block, skip this tool call
-                // This <tool_call> start is in thinking block
-                if (form.allow_toolcall_in_think) {
-                    unclosed_reasoning_content = content.substr(think_start + start_think.size());
-                } else {
-                    unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
-                }
-                reasoning_unclosed = true;
-                content.resize(think_start);
-                toolcall_in_think = true;
-            }
-        }
-
-        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
-            rstrip(content);
-            // Handle unclosed </think> token from content: delete all </think> token
-            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
-                while (pos != std::string::npos) {
-                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
-                    pos = content.rfind(end_think, pos);
-                }
-            }
-            // Strip if needed
-            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
-                content = string_strip(content);
-            }
-        }
-
-        // remove potential partial suffix
-        if (builder.pos() == builder.input().size()) {
-            if (unclosed_reasoning_content.empty()) {
-                rstrip(content);
-                trim_potential_partial_word(content);
-                rstrip(content);
-            } else {
-                rstrip(unclosed_reasoning_content);
-                trim_potential_partial_word(unclosed_reasoning_content);
-                rstrip(unclosed_reasoning_content);
-            }
-        }
-
-        // consume unclosed_reasoning_content if allow_toolcall_in_think is set
-        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
-            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
-                builder.add_reasoning_content(unclosed_reasoning_content);
-            } else {
-                if (content.empty()) {
-                    content = start_think + unclosed_reasoning_content;
-                } else {
-                    content += "\n\n" + start_think;
-                    content += unclosed_reasoning_content;
-                }
-            }
-            unclosed_reasoning_content.clear();
-        }
-
-        // Add content
-        if (!content.empty()) {
-            // If there are multiple content blocks
-            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
-                builder.add_content("\n\n");
-            }
-            builder.add_content(content);
-        }
-
-        // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
-        if (toolcall_in_think && !form.allow_toolcall_in_think) {
-            continue;
-        }
-
-        // There is no tool call and all content is parsed
-        if (!tc) {
-            GGML_ASSERT(builder.pos() == builder.input().size());
-            GGML_ASSERT(unclosed_reasoning_content.empty());
-            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
-            break;
-        }
-
-        builder.move_to(tc->groups[0].begin);
-        if (builder.try_consume_xml_tool_calls(form)) {
-            auto end_of_tool = builder.pos();
-            builder.consume_spaces();
-            if (builder.pos() != builder.input().size()) {
-                builder.move_to(end_of_tool);
-                if (!builder.result().content.empty()) {
-                    builder.add_content("\n\n");
-                }
-            }
-        } else {
-            static const common_regex next_char_regex(".");
-            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
-            rstrip(c);
-            builder.add_content(c);
-        }
-    }
-}
-
-/**
- * Parse content uses reasoning and XML-Style tool call
- */
-void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
-    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
-}
--- a/common/chat-parser-xml-toolcall.h
+++ b/common/chat-parser-xml-toolcall.h
@@ -1,45 +0,0 @@
-#pragma once
-
-#include "chat.h"
-
-#include <nlohmann/json.hpp>
-
-#include <optional>
-#include <string>
-#include <vector>
-
-
-// Sample config:
-// MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
-// GLM 4.5   (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
-struct xml_tool_call_format {
-    std::string scope_start; // <minimax:tool_call>\n  // \n                      // can be empty
-    std::string tool_start;  // <invoke name=\"        // <tool_call>
-    std::string tool_sep;    // \">\n                  // \n                      // can be empty only for parse_xml_tool_calls
-    std::string key_start;   // <parameter name=\"     // <arg_key>
-    std::string key_val_sep; // \">                    // </arg_key>\n<arg_value>
-    std::string val_end;     // </parameter>\n         // </arg_value>\n
-    std::string tool_end;    // </invoke>\n            // </tool_call>\n
-    std::string scope_end;   // </minimax:tool_call>   //                         // can be empty
-    // Set this if there can be dynamic spaces inside key_val_sep.
-    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
-    std::optional<std::string> key_val_sep2 = std::nullopt;
-    // Set true if argval should only be raw string. e.g. Hello "world" hi
-    // Set false if argval should only be json string. e.g. "Hello \"world\" hi"
-    // Defaults to std::nullopt, both will be allowed.
-    std::optional<bool> raw_argval = std::nullopt;
-    std::optional<std::string> last_val_end = std::nullopt;
-    std::optional<std::string> last_tool_end = std::nullopt;
-    bool trim_raw_argval = false;
-    bool allow_toolcall_in_think = false;
-};
-
-// make a GBNF that accept any strings except those containing any of the forbidden strings.
-std::string make_gbnf_excluding(std::vector<std::string> forbids);
-
-/**
- * Build grammar for xml-style tool call
- * form.scope_start and form.scope_end can be empty.
- * Requires data.format for model-specific hacks.
- */
-void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -1,7 +1,6 @@
 #pragma once

 #include "chat.h"
-#include "chat-parser-xml-toolcall.h"
 #include "json-partial.h"
 #include "regex-partial.h"

@@ -120,14 +119,5 @@ class common_chat_msg_parser {
        const std::vector<std::vector<std::string>> & content_paths = {}
    );

-    /**
-     * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
-     * form.scope_start, form.tool_sep and form.scope_end can be empty.
-     */
-    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);
-
-    // Parse content uses reasoning and XML-Style tool call
-    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");
-
    void clear_tools();
 };
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -1,114 +0,0 @@
-#include "chat-peg-parser.h"
-
-#include <nlohmann/json.hpp>
-
-using json = nlohmann::json;
-
-static std::string_view trim_trailing_space(std::string_view sv) {
-    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
-        sv.remove_suffix(1);
-    }
-    return sv;
-}
-
-void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
-    arena.visit(result, [this](const common_peg_ast_node & node) {
-        map(node);
-    });
-}
-
-void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
-    bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
-    bool is_content = node.tag == common_chat_peg_builder::CONTENT;
-
-    if (is_reasoning) {
-        result.reasoning_content = std::string(trim_trailing_space(node.text));
-    }
-
-    if (is_content) {
-        result.content = std::string(trim_trailing_space(node.text));
-    }
-}
-
-void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
-    common_chat_peg_mapper::map(node);
-
-    bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
-    bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
-    bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
-    bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
-
-    if (is_tool_open) {
-        result.tool_calls.emplace_back();
-        current_tool = &result.tool_calls.back();
-    }
-
-    if (is_tool_id && current_tool) {
-        current_tool->id = std::string(trim_trailing_space(node.text));
-    }
-
-    if (is_tool_name && current_tool) {
-        current_tool->name = std::string(trim_trailing_space(node.text));
-    }
-
-    if (is_tool_args && current_tool) {
-        current_tool->arguments = std::string(trim_trailing_space(node.text));
-    }
-}
-
-void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
-    common_chat_peg_mapper::map(node);
-
-    bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
-    bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
-    bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
-    bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
-    bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
-    bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
-    bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
-    bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
-
-    if (is_tool_open) {
-        result.tool_calls.emplace_back();
-        current_tool = &result.tool_calls.back();
-        arg_count = 0;
-    }
-
-    if (is_tool_name) {
-        current_tool->name = std::string(node.text);
-        current_tool->arguments = "{";
-    }
-
-    if (is_arg_open) {
-        needs_closing_quote = false;
-    }
-
-    if (is_arg_name && current_tool) {
-        if (arg_count > 0) {
-            current_tool->arguments += ",";
-        }
-        current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
-        ++arg_count;
-    }
-
-    if (is_arg_string && current_tool) {
-        // Serialize to JSON, but exclude the end quote
-        std::string dumped = json(node.text).dump();
-        current_tool->arguments += dumped.substr(0, dumped.size() - 1);
-        needs_closing_quote = true;
-    }
-
-    if (is_arg_close && current_tool) {
-        if (needs_closing_quote) {
-            current_tool->arguments += "\"";
-        }
-    }
-
-    if (is_arg_json && current_tool) {
-        current_tool->arguments += std::string(trim_trailing_space(node.text));
-    }
-
-    if (is_tool_close && current_tool) {
-        current_tool->arguments += "}";
-    }
-}
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -1,105 +0,0 @@
-#pragma once
-
-#include "chat.h"
-#include "peg-parser.h"
-
-class common_chat_peg_builder : public common_peg_parser_builder {
-  public:
-    static constexpr const char * REASONING_BLOCK = "reasoning-block";
-    static constexpr const char * REASONING = "reasoning";
-    static constexpr const char * CONTENT = "content";
-
-    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
-    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
-    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
-};
-
-inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
-    common_chat_peg_builder builder;
-    builder.set_root(fn(builder));
-    return builder.build();
-}
-
-class common_chat_peg_mapper {
-  public:
-    common_chat_msg & result;
-
-    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
-
-    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
-    virtual void map(const common_peg_ast_node & node);
-};
-
-class common_chat_peg_native_builder : public common_chat_peg_builder {
-  public:
-    static constexpr const char * TOOL = "tool";
-    static constexpr const char * TOOL_OPEN = "tool-open";
-    static constexpr const char * TOOL_CLOSE = "tool-close";
-    static constexpr const char * TOOL_ID = "tool-id";
-    static constexpr const char * TOOL_NAME = "tool-name";
-    static constexpr const char * TOOL_ARGS = "tool-args";
-
-    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
-    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
-    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
-    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
-    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
-    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
-};
-
-class common_chat_peg_native_mapper : public common_chat_peg_mapper {
-    common_chat_tool_call * current_tool;
-
-  public:
-    common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
-
-    void map(const common_peg_ast_node & node) override;
-};
-
-inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
-    common_chat_peg_native_builder builder;
-    builder.set_root(fn(builder));
-    return builder.build();
-}
-
-class common_chat_peg_constructed_builder : public common_chat_peg_builder {
-  public:
-    static constexpr const char * TOOL = "tool";
-    static constexpr const char * TOOL_OPEN = "tool-open";
-    static constexpr const char * TOOL_CLOSE = "tool-close";
-    static constexpr const char * TOOL_NAME = "tool-name";
-    static constexpr const char * TOOL_ARG = "tool-arg";
-    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
-    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
-    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
-    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
-    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
-
-    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
-    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
-    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
-    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
-    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
-    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
-    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
-    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
-    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
-    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
-};
-
-class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
-    common_chat_tool_call * current_tool;
-    int arg_count = 0;
-    bool needs_closing_quote = false;
-
-  public:
-    common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
-
-    void map(const common_peg_ast_node & node) override;
-};
-
-inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
-    common_chat_peg_constructed_builder builder;
-    builder.set_root(fn(builder));
-    return builder.build();
-}
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -3,7 +3,6 @@
 #pragma once

 #include "common.h"
-#include "peg-parser.h"
 #include <functional>
 #include <chrono>
 #include <string>
@@ -77,7 +76,7 @@ struct common_chat_msg_diff {
    size_t tool_call_index = std::string::npos;
    common_chat_tool_call tool_call_delta;

-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);

    bool operator==(const common_chat_msg_diff & other) const {
        return content_delta == other.content_delta
@@ -118,17 +117,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
    COMMON_CHAT_FORMAT_APERTUS,
    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
-    COMMON_CHAT_FORMAT_GLM_4_5,
-    COMMON_CHAT_FORMAT_MINIMAX_M2,
-    COMMON_CHAT_FORMAT_KIMI_K2,
-    COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
-    COMMON_CHAT_FORMAT_APRIEL_1_5,
-    COMMON_CHAT_FORMAT_XIAOMI_MIMO,
-
-    // These are intended to be parsed by the PEG parser
-    COMMON_CHAT_FORMAT_PEG_SIMPLE,
-    COMMON_CHAT_FORMAT_PEG_NATIVE,
-    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -160,7 +148,6 @@ struct common_chat_params {
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
-    std::string                         parser;
 };

 struct common_chat_syntax {
@@ -170,7 +157,6 @@ struct common_chat_syntax {
    bool                     reasoning_in_content  = false;
    bool                     thinking_forced_open  = false;
    bool                     parse_tool_calls      = true;
-    common_peg_arena         parser                = {};
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@@ -214,7 +200,6 @@ const char*               common_chat_format_name(common_chat_format format);
 const char*               common_reasoning_format_name(common_reasoning_format format);
 common_reasoning_format   common_reasoning_format_from_name(const std::string & format);
 common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
-common_chat_msg           common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -8,7 +8,6 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
-#include "sampling.h"

 #include <algorithm>
 #include <cinttypes>
@@ -27,6 +26,7 @@
 #include <sstream>
 #include <string>
 #include <thread>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>

@@ -60,14 +60,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
-
-common_time_meas::~common_time_meas() {
-    if (t_start_us >= 0) {
-        t_acc += ggml_time_us() - t_start_us;
-    }
-}
-
 //
 // CPU utils
 //
@@ -694,7 +686,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over

 // Validate if a filename is safe to use
 // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
-bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
+bool fs_validate_filename(const std::string & filename) {
    if (!filename.length()) {
        // Empty filename invalid
        return false;
@@ -754,14 +746,10 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
-            || c == ':' || c == '*' // Illegal characters
+            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
-        if (!allow_subdirs && (c == '/' || c == '\\')) {
-            // Subdirectories not allowed, reject path separators
-            return false;
-        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
@@ -786,29 +774,11 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
 #include <iostream>


-#ifdef _WIN32
-static std::wstring utf8_to_wstring(const std::string & str) {
-    if (str.empty()) {
-        return std::wstring();
-    }
-
-    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
-
-    if (size <= 0) {
-        return std::wstring();
-    }
-
-    std::wstring wstr(size, 0);
-    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
-
-    return wstr;
-}
-#endif
-
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
-    std::wstring wpath = utf8_to_wstring(path);
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    std::wstring wpath = converter.from_bytes(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -881,11 +851,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
 #endif // _WIN32
 }

-bool fs_is_directory(const std::string & path) {
-    std::filesystem::path dir(path);
-    return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
-}
-
 std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
@@ -920,8 +885,6 @@ std::string fs_get_cache_directory() {
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
-#elif defined(__EMSCRIPTEN__)
-        GGML_ABORT("not implemented on this platform");
 #else
 #  error Unknown architecture
 #endif
@@ -941,7 +904,7 @@ std::string fs_get_cache_file(const std::string & filename) {
    return cache_directory + filename;
 }

-std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
+std::vector<common_file_info> fs_list_files(const std::string & path) {
    std::vector<common_file_info> files;
    if (path.empty()) return files;

@@ -956,22 +919,14 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
            const auto & p = entry.path();
            if (std::filesystem::is_regular_file(p)) {
                common_file_info info;
-                info.path   = p.string();
-                info.name   = p.filename().string();
-                info.is_dir = false;
+                info.path = p.string();
+                info.name = p.filename().string();
                try {
                    info.size = static_cast<size_t>(std::filesystem::file_size(p));
                } catch (const std::filesystem::filesystem_error &) {
                    info.size = 0;
                }
                files.push_back(std::move(info));
-            } else if (include_directories && std::filesystem::is_directory(p)) {
-                common_file_info info;
-                info.path   = p.string();
-                info.name   = p.filename().string();
-                info.size   = 0; // Directories have no size
-                info.is_dir = true;
-                files.push_back(std::move(info));
            }
        } catch (const std::filesystem::filesystem_error &) {
            // skip entries we cannot inspect
@@ -982,89 +937,11 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
    return files;
 }

-//
-// TTY utils
-//
-
-bool tty_can_use_colors() {
-    // Check NO_COLOR environment variable (https://no-color.org/)
-    if (const char * no_color = std::getenv("NO_COLOR")) {
-        if (no_color[0] != '\0') {
-            return false;
-        }
-    }
-
-    // Check TERM environment variable
-    if (const char * term = std::getenv("TERM")) {
-        if (std::strcmp(term, "dumb") == 0) {
-            return false;
-        }
-    }
-
-    // Check if stdout and stderr are connected to a terminal
-    // We check both because log messages can go to either
-    bool stdout_is_tty = isatty(fileno(stdout));
-    bool stderr_is_tty = isatty(fileno(stderr));
-
-    return stdout_is_tty || stderr_is_tty;
-}

 //
 // Model utils
 //

-static inline void common_init_sampler_from_model(
-    const llama_model * model,
-    common_params_sampling & sparams) {
-
-    const uint64_t config = sparams.user_sampling_config;
-
-    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
-        if (config & user_config) return;
-
-        char buf[64] = {0};
-        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
-            char * end = nullptr;
-            int32_t v = strtol(buf, &end, 10);
-            if (end && end != buf) dst = v;
-        }
-    };
-
-    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
-        if (config & user_config) return;
-
-        char buf[128] = {0};
-        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
-            char * end = nullptr;
-            float v = strtof(buf, &end);
-            if (end && end != buf) dst = v;
-        }
-    };
-
-    // Sampling sequence
-    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
-        char buf[512] = {0};
-        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
-            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
-            if (!sampler_names.empty()) {
-                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
-            }
-        }
-    }
-
-    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K),           sparams.top_k,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P),           sparams.top_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P),           sparams.min_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD),   sparams.xtc_threshold,   common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP),            sparams.temp,            common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
-    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N),  sparams.penalty_last_n,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT),  sparams.penalty_repeat,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
-    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT),        sparams.mirostat,        common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU),    sparams.mirostat_tau,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
-    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA),    sparams.mirostat_eta,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
-}
-
 struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
    auto mparams = common_model_params_to_llama(params);
@@ -1076,8 +953,6 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }

-    common_init_sampler_from_model(model, params.sampling);
-
    const llama_vocab * vocab = llama_model_get_vocab(model);

    auto cparams = common_context_params_to_llama(params);
--- a/common/common.h
+++ b/common/common.h
@@ -2,19 +2,17 @@

 #pragma once

-#include "ggml-opt.h"
-#include "llama-cpp.h"
-
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
+#include <sstream>
+#include <cmath>

-#if defined(_WIN32) && !defined(_WIN32_WINNT)
-#define _WIN32_WINNT 0x0A00
-#endif
+#include "ggml-opt.h"
+#include "llama-cpp.h"

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,14 +28,7 @@
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

-struct common_time_meas {
-    common_time_meas(int64_t & t_acc, bool disable = false);
-    ~common_time_meas();
-
-    const int64_t t_start_us;
-
-    int64_t & t_acc;
-};
+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

 struct common_adapter_lora_info {
    std::string path;
@@ -82,8 +73,7 @@ int32_t cpu_get_num_math();
 enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_COMPLETION,
-    LLAMA_EXAMPLE_CLI,
+    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -143,22 +133,6 @@ struct common_grammar_trigger {
    llama_token token = LLAMA_TOKEN_NULL;
 };

-enum common_params_sampling_config : uint64_t {
-    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
-    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
-    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
-    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
-    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
-    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
-    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
-    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
-    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
-    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
-    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
-    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
-};
-
-
 // sampling parameters
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -191,8 +165,6 @@ struct common_params_sampling {
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;

-    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
-
    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY


@@ -226,7 +198,6 @@ struct common_params_model {
    std::string hf_repo     = ""; // HF repo                                                // NOLINT
    std::string hf_file     = ""; // HF file                                                // NOLINT
    std::string docker_repo = ""; // Docker repo                                            // NOLINT
-    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

 struct common_params_speculative {
@@ -373,7 +344,7 @@ struct common_params {

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

-    int32_t verbosity                  = 3;  // LOG_LEVEL_INFO
+    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;
@@ -407,7 +378,6 @@ struct common_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool no_perf           = false; // disable performance metrics
-    bool show_timings      = true;  // show timing information on CLI
    bool ctx_shift         = false; // context shift on infinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache
@@ -464,7 +434,7 @@ struct common_params {
    std::string public_path   = "";                                                                         // NOLINT
    std::string api_prefix    = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
-    bool use_jinja = true;                                                                                  // NOLINT
+    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
@@ -483,16 +453,9 @@ struct common_params {
    bool endpoint_props   = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

-    // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
-
    bool log_json = false;

    std::string slot_save_path;
-    std::string media_path; // path to directory for loading media files

    float slot_prompt_similarity = 0.1f;

@@ -643,9 +606,8 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 // Filesystem utils
 //

-bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
+bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);
-bool fs_is_directory(const std::string & path);

 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
@@ -654,16 +616,8 @@ struct common_file_info {
    std::string path;
    std::string name;
    size_t      size = 0; // in bytes
-    bool        is_dir = false;
 };
-std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
-
-//
-// TTY utils
-//
-
-// Auto-detect if colors can be enabled based on terminal and environment
-bool tty_can_use_colors();
+std::vector<common_file_info> fs_list_files(const std::string & path);

 //
 // Model utils
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -1,16 +1,6 @@
 #include "console.h"
-#include "log.h"
 #include <vector>
 #include <iostream>
-#include <cassert>
-#include <cstddef>
-#include <cctype>
-#include <cwctype>
-#include <cstdint>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-#include <stdarg.h>

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -40,44 +30,26 @@
 #define ANSI_COLOR_BLUE    "\x1b[34m"
 #define ANSI_COLOR_MAGENTA "\x1b[35m"
 #define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_GRAY    "\x1b[90m"
 #define ANSI_COLOR_RESET   "\x1b[0m"
 #define ANSI_BOLD          "\x1b[1m"

 namespace console {

-#if defined (_WIN32)
-    namespace {
-        // Use private-use unicode values to represent special keys that are not reported
-        // as characters (e.g. arrows on Windows). These values should never clash with
-        // real input and let the rest of the code handle navigation uniformly.
-        static constexpr char32_t KEY_ARROW_LEFT       = 0xE000;
-        static constexpr char32_t KEY_ARROW_RIGHT      = 0xE001;
-        static constexpr char32_t KEY_ARROW_UP         = 0xE002;
-        static constexpr char32_t KEY_ARROW_DOWN       = 0xE003;
-        static constexpr char32_t KEY_HOME             = 0xE004;
-        static constexpr char32_t KEY_END              = 0xE005;
-        static constexpr char32_t KEY_CTRL_ARROW_LEFT  = 0xE006;
-        static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
-        static constexpr char32_t KEY_DELETE           = 0xE008;
-    }
-
    //
    // Console state
    //
-#endif

-    static bool         advanced_display = false;
-    static bool         simple_io        = true;
-    static display_type current_display  = DISPLAY_TYPE_RESET;
+    static bool      advanced_display = false;
+    static bool      simple_io        = true;
+    static display_t current_display  = reset;

-    static FILE*        out              = stdout;
+    static FILE*     out              = stdout;

 #if defined (_WIN32)
-    static void*        hConsole;
+    static void*     hConsole;
 #else
-    static FILE*        tty              = nullptr;
-    static termios      initial_state;
+    static FILE*     tty              = nullptr;
+    static termios   initial_state;
 #endif

    //
@@ -148,7 +120,7 @@ namespace console {

    void cleanup() {
        // Reset console display
-        set_display(DISPLAY_TYPE_RESET);
+        set_display(reset);

 #if !defined(_WIN32)
        // Restore settings on POSIX systems
@@ -168,26 +140,20 @@ namespace console {
    //

    // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_type display) {
+    void set_display(display_t display) {
        if (advanced_display && current_display != display) {
-            common_log_flush(common_log_main());
+            fflush(stdout);
            switch(display) {
-                case DISPLAY_TYPE_RESET:
+                case reset:
                    fprintf(out, ANSI_COLOR_RESET);
                    break;
-                case DISPLAY_TYPE_INFO:
-                    fprintf(out, ANSI_COLOR_MAGENTA);
-                    break;
-                case DISPLAY_TYPE_PROMPT:
+                case prompt:
                    fprintf(out, ANSI_COLOR_YELLOW);
                    break;
-                case DISPLAY_TYPE_REASONING:
-                    fprintf(out, ANSI_COLOR_GRAY);
-                    break;
-                case DISPLAY_TYPE_USER_INPUT:
+                case user_input:
                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                    break;
-                case DISPLAY_TYPE_ERROR:
+                case error:
                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
            }
            current_display = display;
@@ -210,18 +176,7 @@ namespace console {
            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
                if (wc == 0) {
-                    const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
-                    const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
-                    switch (record.Event.KeyEvent.wVirtualKeyCode) {
-                        case VK_LEFT:   return ctrl_pressed ? KEY_CTRL_ARROW_LEFT  : KEY_ARROW_LEFT;
-                        case VK_RIGHT:  return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
-                        case VK_UP:     return KEY_ARROW_UP;
-                        case VK_DOWN:   return KEY_ARROW_DOWN;
-                        case VK_HOME:   return KEY_HOME;
-                        case VK_END:    return KEY_END;
-                        case VK_DELETE: return KEY_DELETE;
-                        default:        continue;
-                    }
+                    continue;
                }

                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
@@ -360,52 +315,6 @@ namespace console {
 #endif
    }

-    static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
-        unsigned char c = static_cast<unsigned char>(input[pos]);
-        if ((c & 0x80u) == 0u) {
-            advance = 1;
-            return c;
-        }
-        if ((c & 0xE0u) == 0xC0u && pos + 1 < input.size()) {
-            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
-            if ((c1 & 0xC0u) != 0x80u) {
-                advance = 1;
-                return 0xFFFD;
-            }
-            advance = 2;
-            return ((c & 0x1Fu) << 6) | (static_cast<unsigned char>(input[pos + 1]) & 0x3Fu);
-        }
-        if ((c & 0xF0u) == 0xE0u && pos + 2 < input.size()) {
-            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
-            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
-            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u) {
-                advance = 1;
-                return 0xFFFD;
-            }
-            advance = 3;
-            return ((c & 0x0Fu) << 12) |
-                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 6) |
-                   (static_cast<unsigned char>(input[pos + 2]) & 0x3Fu);
-        }
-        if ((c & 0xF8u) == 0xF0u && pos + 3 < input.size()) {
-            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
-            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
-            unsigned char c3 = static_cast<unsigned char>(input[pos + 3]);
-            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u || (c3 & 0xC0u) != 0x80u) {
-                advance = 1;
-                return 0xFFFD;
-            }
-            advance = 4;
-            return ((c & 0x07u) << 18) |
-                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 12) |
-                   ((static_cast<unsigned char>(input[pos + 2]) & 0x3Fu) << 6) |
-                   (static_cast<unsigned char>(input[pos + 3]) & 0x3Fu);
-        }
-
-        advance = 1;
-        return 0xFFFD; // replacement character for invalid input
-    }
-
    static void append_utf8(char32_t ch, std::string & out) {
        if (ch <= 0x7F) {
            out.push_back(static_cast<unsigned char>(ch));
@@ -427,319 +336,22 @@ namespace console {
    }

    // Helper function to remove the last UTF-8 character from a string
-    static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
-        if (pos == 0) return 0;
-        pos--;
-        while (pos > 0 && (line[pos] & 0xC0) == 0x80) {
-            pos--;
-        }
-        return pos;
-    }
-
-    static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
-        if (pos >= line.length()) return line.length();
-        pos++;
-        while (pos < line.length() && (line[pos] & 0xC0) == 0x80) {
-            pos++;
-        }
-        return pos;
-    }
-
-    static void move_cursor(int delta);
-    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
-    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
-    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
-    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
-
-    static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
-        if (char_pos >= widths.size()) {
+    static void pop_back_utf8_char(std::string & line) {
+        if (line.empty()) {
            return;
        }

-        size_t next_pos = next_utf8_char_pos(line, byte_pos);
-        int w = widths[char_pos];
-        size_t char_len = next_pos - byte_pos;
+        size_t pos = line.length() - 1;

-        line.erase(byte_pos, char_len);
-        widths.erase(widths.begin() + char_pos);
-
-        size_t p = byte_pos;
-        int tail_width = 0;
-        for (size_t i = char_pos; i < widths.size(); ++i) {
-            size_t following = next_utf8_char_pos(line, p);
-            put_codepoint(line.c_str() + p, following - p, widths[i]);
-            tail_width += widths[i];
-            p = following;
+        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
+        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
+            if ((line[pos] & 0xC0) != 0x80) {
+                break; // Found the start of the character
+            }
        }
-
-        for (int i = 0; i < w; ++i) {
-            fputc(' ', out);
-        }
-
-        move_cursor(-(tail_width + w));
+        line.erase(pos);
    }

-    static void clear_current_line(const std::vector<int> & widths) {
-        int total_width = 0;
-        for (int w : widths) {
-            total_width += (w > 0 ? w : 1);
-        }
-
-        if (total_width > 0) {
-            std::string spaces(total_width, ' ');
-            fwrite(spaces.c_str(), 1, total_width, out);
-            move_cursor(-total_width);
-        }
-    }
-
-    static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
-                                  size_t & byte_pos) {
-        move_to_line_start(char_pos, byte_pos, widths);
-        clear_current_line(widths);
-
-        line = std::move(new_line);
-        widths.clear();
-        byte_pos = 0;
-        char_pos = 0;
-
-        size_t idx = 0;
-        while (idx < line.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, idx, advance);
-            int expected_width = estimateWidth(cp);
-            int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
-            if (real_width < 0) real_width = 0;
-            widths.push_back(real_width);
-            idx += advance;
-            ++char_pos;
-            byte_pos = idx;
-        }
-    }
-
-    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
-        int back_width = 0;
-        for (size_t i = 0; i < char_pos; ++i) {
-            back_width += widths[i];
-        }
-        move_cursor(-back_width);
-        char_pos = 0;
-        byte_pos = 0;
-    }
-
-    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
-        int forward_width = 0;
-        for (size_t i = char_pos; i < widths.size(); ++i) {
-            forward_width += widths[i];
-        }
-        move_cursor(forward_width);
-        char_pos = widths.size();
-        byte_pos = line.length();
-    }
-
-    static bool has_ctrl_modifier(const std::string & params) {
-        size_t start = 0;
-        while (start < params.size()) {
-            size_t end = params.find(';', start);
-            size_t len = (end == std::string::npos) ? params.size() - start : end - start;
-            if (len > 0) {
-                int value = 0;
-                for (size_t i = 0; i < len; ++i) {
-                    char ch = params[start + i];
-                    if (!std::isdigit(static_cast<unsigned char>(ch))) {
-                        value = -1;
-                        break;
-                    }
-                    value = value * 10 + (ch - '0');
-                }
-                if (value == 5) {
-                    return true;
-                }
-            }
-
-            if (end == std::string::npos) {
-                break;
-            }
-            start = end + 1;
-        }
-        return false;
-    }
-
-    static bool is_space_codepoint(char32_t cp) {
-        return std::iswspace(static_cast<wint_t>(cp)) != 0;
-    }
-
-    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
-        if (char_pos == 0) {
-            return;
-        }
-
-        size_t new_char_pos = char_pos;
-        size_t new_byte_pos = byte_pos;
-        int move_width = 0;
-
-        while (new_char_pos > 0) {
-            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, prev_byte, advance);
-            if (!is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos - 1];
-            new_char_pos--;
-            new_byte_pos = prev_byte;
-        }
-
-        while (new_char_pos > 0) {
-            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, prev_byte, advance);
-            if (is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos - 1];
-            new_char_pos--;
-            new_byte_pos = prev_byte;
-        }
-
-        move_cursor(-move_width);
-        char_pos = new_char_pos;
-        byte_pos = new_byte_pos;
-    }
-
-    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
-        if (char_pos >= widths.size()) {
-            return;
-        }
-
-        size_t new_char_pos = char_pos;
-        size_t new_byte_pos = byte_pos;
-        int move_width = 0;
-
-        while (new_char_pos < widths.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, new_byte_pos, advance);
-            if (!is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos];
-            new_char_pos++;
-            new_byte_pos += advance;
-        }
-
-        while (new_char_pos < widths.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, new_byte_pos, advance);
-            if (is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos];
-            new_char_pos++;
-            new_byte_pos += advance;
-        }
-
-        while (new_char_pos < widths.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, new_byte_pos, advance);
-            if (!is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos];
-            new_char_pos++;
-            new_byte_pos += advance;
-        }
-
-        move_cursor(move_width);
-        char_pos = new_char_pos;
-        byte_pos = new_byte_pos;
-    }
-
-    static void move_cursor(int delta) {
-        if (delta == 0) return;
-#if defined(_WIN32)
-        if (hConsole != NULL) {
-            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
-            COORD newCursorPosition = bufferInfo.dwCursorPosition;
-            int width = bufferInfo.dwSize.X;
-            int newX = newCursorPosition.X + delta;
-            int newY = newCursorPosition.Y;
-
-            while (newX >= width) {
-                newX -= width;
-                newY++;
-            }
-            while (newX < 0) {
-                newX += width;
-                newY--;
-            }
-
-            newCursorPosition.X = newX;
-            newCursorPosition.Y = newY;
-            SetConsoleCursorPosition(hConsole, newCursorPosition);
-        }
-#else
-        if (delta < 0) {
-            for (int i = 0; i < -delta; i++) fprintf(out, "\b");
-        } else {
-            for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
-        }
-#endif
-    }
-
-    struct history_t {
-        std::vector<std::string> entries;
-        size_t viewing_idx = SIZE_MAX;
-        std::string backup_line; // current line before viewing history
-        void add(const std::string & line) {
-            if (line.empty()) {
-                return;
-            }
-            // avoid duplicates with the last entry
-            if (entries.empty() || entries.back() != line) {
-                entries.push_back(line);
-            }
-            // also clear viewing state
-            end_viewing();
-        }
-        bool prev(std::string & cur_line) {
-            if (entries.empty()) {
-                return false;
-            }
-            if (viewing_idx == SIZE_MAX) {
-                return false;
-            }
-            if (viewing_idx > 0) {
-                viewing_idx--;
-            }
-            cur_line = entries[viewing_idx];
-            return true;
-        }
-        bool next(std::string & cur_line) {
-            if (entries.empty() || viewing_idx == SIZE_MAX) {
-                return false;
-            }
-            viewing_idx++;
-            if (viewing_idx >= entries.size()) {
-                cur_line = backup_line;
-                end_viewing();
-            } else {
-                cur_line = entries[viewing_idx];
-            }
-            return true;
-        }
-        void begin_viewing(const std::string & line) {
-            backup_line = line;
-            viewing_idx = entries.size();
-        }
-        void end_viewing() {
-            viewing_idx = SIZE_MAX;
-            backup_line.clear();
-        }
-        bool is_viewing() const {
-            return viewing_idx != SIZE_MAX;
-        }
-    } history;
-
    static bool readline_advanced(std::string & line, bool multiline_input) {
        if (out != stdout) {
            fflush(stdout);
@@ -750,33 +362,8 @@ namespace console {
        bool is_special_char = false;
        bool end_of_stream = false;

-        size_t byte_pos = 0; // current byte index
-        size_t char_pos = 0; // current character index (one char can be multiple bytes)
-
        char32_t input_char;
        while (true) {
-            assert(char_pos <= byte_pos);
-            assert(char_pos <= widths.size());
-            auto history_prev = [&]() {
-                if (!history.is_viewing()) {
-                    history.begin_viewing(line);
-                }
-                std::string new_line;
-                if (!history.prev(new_line)) {
-                    return;
-                }
-                set_line_contents(new_line, line, widths, char_pos, byte_pos);
-            };
-            auto history_next = [&]() {
-                if (history.is_viewing()) {
-                    std::string new_line;
-                    if (!history.next(new_line)) {
-                        return;
-                    }
-                    set_line_contents(new_line, line, widths, char_pos, byte_pos);
-                }
-            };
-
            fflush(out); // Ensure all output is displayed before waiting for input
            input_char = getchar32();

@@ -784,83 +371,20 @@ namespace console {
                break;
            }

-            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
+            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
                end_of_stream = true;
                break;
            }

            if (is_special_char) {
+                set_display(user_input);
                replace_last(line.back());
                is_special_char = false;
            }

            if (input_char == '\033') { // Escape sequence
                char32_t code = getchar32();
-                if (code == '[') {
-                    std::string params;
-                    while (true) {
-                        code = getchar32();
-                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
-                            break;
-                        }
-                        params.push_back(static_cast<char>(code));
-                    }
-
-                    const bool ctrl_modifier = has_ctrl_modifier(params);
-
-                    if (code == 'D') { // left
-                        if (ctrl_modifier) {
-                            move_word_left(char_pos, byte_pos, widths, line);
-                        } else if (char_pos > 0) {
-                            int w = widths[char_pos - 1];
-                            move_cursor(-w);
-                            char_pos--;
-                            byte_pos = prev_utf8_char_pos(line, byte_pos);
-                        }
-                    } else if (code == 'C') { // right
-                        if (ctrl_modifier) {
-                            move_word_right(char_pos, byte_pos, widths, line);
-                        } else if (char_pos < widths.size()) {
-                            int w = widths[char_pos];
-                            move_cursor(w);
-                            char_pos++;
-                            byte_pos = next_utf8_char_pos(line, byte_pos);
-                        }
-                    } else if (code == 'H') { // home
-                        move_to_line_start(char_pos, byte_pos, widths);
-                    } else if (code == 'F') { // end
-                        move_to_line_end(char_pos, byte_pos, widths, line);
-                    } else if (code == 'A' || code == 'B') {
-                        // up/down
-                        if (code == 'A') {
-                            history_prev();
-                            is_special_char = false;
-                        } else if (code == 'B') {
-                            history_next();
-                            is_special_char = false;
-                        }
-                    } else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
-                        std::string digits;
-                        for (char ch : params) {
-                            if (ch == ';') {
-                                break;
-                            }
-                            if (std::isdigit(static_cast<unsigned char>(ch))) {
-                                digits.push_back(ch);
-                            }
-                        }
-
-                        if (code == '~') {
-                            if (digits == "1" || digits == "7") { // home
-                                move_to_line_start(char_pos, byte_pos, widths);
-                            } else if (digits == "4" || digits == "8") { // end
-                                move_to_line_end(char_pos, byte_pos, widths, line);
-                            } else if (digits == "3") { // delete
-                                delete_at_cursor(line, widths, char_pos, byte_pos);
-                            }
-                        }
-                    }
-                } else if (code == 0x1B) {
+                if (code == '[' || code == 0x1B) {
                    // Discard the rest of the escape sequence
                    while ((code = getchar32()) != (char32_t) WEOF) {
                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
@@ -868,110 +392,32 @@ namespace console {
                        }
                    }
                }
-#if defined(_WIN32)
-            } else if (input_char == KEY_ARROW_LEFT) {
-                if (char_pos > 0) {
-                    int w = widths[char_pos - 1];
-                    move_cursor(-w);
-                    char_pos--;
-                    byte_pos = prev_utf8_char_pos(line, byte_pos);
-                }
-            } else if (input_char == KEY_ARROW_RIGHT) {
-                if (char_pos < widths.size()) {
-                    int w = widths[char_pos];
-                    move_cursor(w);
-                    char_pos++;
-                    byte_pos = next_utf8_char_pos(line, byte_pos);
-                }
-            } else if (input_char == KEY_CTRL_ARROW_LEFT) {
-                move_word_left(char_pos, byte_pos, widths, line);
-            } else if (input_char == KEY_CTRL_ARROW_RIGHT) {
-                move_word_right(char_pos, byte_pos, widths, line);
-            } else if (input_char == KEY_HOME) {
-                move_to_line_start(char_pos, byte_pos, widths);
-            } else if (input_char == KEY_END) {
-                move_to_line_end(char_pos, byte_pos, widths, line);
-            } else if (input_char == KEY_DELETE) {
-                delete_at_cursor(line, widths, char_pos, byte_pos);
-            } else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
-                if (input_char == KEY_ARROW_UP) {
-                    history_prev();
-                    is_special_char = false;
-                } else if (input_char == KEY_ARROW_DOWN) {
-                    history_next();
-                    is_special_char = false;
-                }
-#endif
            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-                if (char_pos > 0) {
-                    int w = widths[char_pos - 1];
-                    move_cursor(-w);
-                    char_pos--;
-                    size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
-                    size_t char_len = byte_pos - prev_pos;
-                    byte_pos = prev_pos;
-
-                    // remove the character
-                    line.erase(byte_pos, char_len);
-                    widths.erase(widths.begin() + char_pos);
-
-                    // redraw tail
-                    size_t p = byte_pos;
-                    int tail_width = 0;
-                    for (size_t i = char_pos; i < widths.size(); ++i) {
-                        size_t next_p = next_utf8_char_pos(line, p);
-                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
-                        tail_width += widths[i];
-                        p = next_p;
-                    }
-
-                    // clear display
-                    for (int i = 0; i < w; ++i) {
-                        fputc(' ', out);
-                    }
-                    move_cursor(-(tail_width + w));
+                if (!widths.empty()) {
+                    int count;
+                    do {
+                        count = widths.back();
+                        widths.pop_back();
+                        // Move cursor back, print space, and move cursor back again
+                        for (int i = 0; i < count; i++) {
+                            replace_last(' ');
+                            pop_cursor();
+                        }
+                        pop_back_utf8_char(line);
+                    } while (count == 0 && !widths.empty());
                }
            } else {
-                // insert character
-                std::string new_char_str;
-                append_utf8(input_char, new_char_str);
-                int w = estimateWidth(input_char);
-
-                if (char_pos == widths.size()) {
-                    // insert at the end
-                    line += new_char_str;
-                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
-                    if (real_w < 0) real_w = 0;
-                    widths.push_back(real_w);
-                    byte_pos += new_char_str.length();
-                    char_pos++;
-                } else {
-                    // insert in middle
-                    line.insert(byte_pos, new_char_str);
-
-                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
-                    if (real_w < 0) real_w = 0;
-
-                    widths.insert(widths.begin() + char_pos, real_w);
-
-                    // print the tail
-                    size_t p = byte_pos + new_char_str.length();
-                    int tail_width = 0;
-                    for (size_t i = char_pos + 1; i < widths.size(); ++i) {
-                        size_t next_p = next_utf8_char_pos(line, p);
-                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
-                        tail_width += widths[i];
-                        p = next_p;
-                    }
-
-                    move_cursor(-tail_width);
-
-                    byte_pos += new_char_str.length();
-                    char_pos++;
+                int offset = line.length();
+                append_utf8(input_char, line);
+                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
+                if (width < 0) {
+                    width = 0;
                }
+                widths.push_back(width);
            }

            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
+                set_display(prompt);
                replace_last(line.back());
                is_special_char = true;
            }
@@ -1005,15 +451,6 @@ namespace console {
            }
        }

-        if (!end_of_stream && !line.empty()) {
-            // remove the trailing newline for history storage
-            if (!line.empty() && line.back() == '\n') {
-                line.pop_back();
-            }
-            // TODO: maybe support multiline history entries?
-            history.add(line);
-        }
-
        fflush(out);
        return has_more;
    }
@@ -1056,82 +493,12 @@ namespace console {
    }

    bool readline(std::string & line, bool multiline_input) {
+        set_display(user_input);
+
        if (simple_io) {
            return readline_simple(line, multiline_input);
        }
        return readline_advanced(line, multiline_input);
    }

-    namespace spinner {
-        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
-        static std::condition_variable cv_stop;
-        static std::thread th;
-        static size_t frame = 0; // only modified by one thread
-        static bool running = false;
-        static std::mutex mtx;
-        static auto wait_time = std::chrono::milliseconds(100);
-        static void draw_next_frame() {
-            // don't need lock because only one thread modifies running
-            frame = (frame + 1) % sizeof(LOADING_CHARS);
-            replace_last(LOADING_CHARS[frame]);
-            fflush(out);
-        }
-        void start() {
-            std::unique_lock<std::mutex> lock(mtx);
-            if (simple_io || running) {
-                return;
-            }
-            common_log_flush(common_log_main());
-            fprintf(out, "%c", LOADING_CHARS[0]);
-            fflush(out);
-            frame = 1;
-            running = true;
-            th = std::thread([]() {
-                std::unique_lock<std::mutex> lock(mtx);
-                while (true) {
-                    if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
-                        break;
-                    }
-                    draw_next_frame();
-                }
-            });
-        }
-        void stop() {
-            {
-                std::unique_lock<std::mutex> lock(mtx);
-                if (simple_io || !running) {
-                    return;
-                }
-                running = false;
-                cv_stop.notify_all();
-            }
-            if (th.joinable()) {
-                th.join();
-            }
-            replace_last(' ');
-            pop_cursor();
-            fflush(out);
-        }
-    }
-
-    void log(const char * fmt, ...) {
-        va_list args;
-        va_start(args, fmt);
-        vfprintf(out, fmt, args);
-        va_end(args);
-    }
-
-    void error(const char * fmt, ...) {
-        va_list args;
-        va_start(args, fmt);
-        display_type cur = current_display;
-        set_display(DISPLAY_TYPE_ERROR);
-        vfprintf(out, fmt, args);
-        set_display(cur); // restore previous color
-        va_end(args);
-    }
-
-    void flush() {
-        fflush(out);
-    }
 }
--- a/common/console.h
+++ b/common/console.h
@@ -2,40 +2,18 @@

 #pragma once

-#include "common.h"
-
 #include <string>

-enum display_type {
-    DISPLAY_TYPE_RESET = 0,
-    DISPLAY_TYPE_INFO,
-    DISPLAY_TYPE_PROMPT,
-    DISPLAY_TYPE_REASONING,
-    DISPLAY_TYPE_USER_INPUT,
-    DISPLAY_TYPE_ERROR
-};
-
 namespace console {
+    enum display_t { // display modes accepted by console::set_display()
+        reset = 0,
+        prompt,     // applied while echoing a trailing '\' or '/' during line editing
+        user_input, // applied by readline() before a line is read
+        error
+    };
+
    void init(bool use_simple_io, bool use_advanced_display);
    void cleanup();
-    void set_display(display_type display);
+    void set_display(display_t display);
    bool readline(std::string & line, bool multiline_input);
-
-    namespace spinner {
-        void start();
-        void stop();
-    }
-
-    // note: the logging API below output directly to stdout
-    // it can negatively impact performance if used on inference thread
-    // only use in in a dedicated CLI thread
-    // for logging in inference thread, use log.h instead
-
-    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
-    void log(const char * fmt, ...);
-
-    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
-    void error(const char * fmt, ...);
-
-    void flush();
 }
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -12,8 +12,6 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
-#include <map>
-#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -26,7 +24,6 @@
 #include "http.h"
 #endif

-#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -38,8 +35,6 @@
 #else
 #include <sys/syslimits.h>
 #endif
-#endif
-
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 // isatty
@@ -435,7 +430,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-    curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
+    curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
        auto data_vec = static_cast<std::vector<char> *>(data);
@@ -474,79 +469,36 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

 #elif defined(LLAMA_USE_HTTPLIB)

-class ProgressBar {
-    static inline std::mutex mutex;
-    static inline std::map<const ProgressBar *, int> lines;
-    static inline int max_line = 0;
-
-    static void cleanup(const ProgressBar * line) {
-        lines.erase(line);
-        if (lines.empty()) {
-            max_line = 0;
-        }
-    }
-
-    static bool is_output_a_tty() {
+static bool is_output_a_tty() {
 #if defined(_WIN32)
-        return _isatty(_fileno(stdout));
+    return _isatty(_fileno(stdout));
 #else
-        return isatty(1);
+    return isatty(1);
 #endif
+}
+
+static void print_progress(size_t current, size_t total) { // draws a one-line progress bar on stdout; current/total are byte counts
+    if (!is_output_a_tty()) { // no bar when stdout is not a terminal
+        return;
    }

-public:
-    ProgressBar() = default;
-
-    ~ProgressBar() {
-        std::lock_guard<std::mutex> lock(mutex);
-        cleanup(this);
+    if (!total) { // unknown total: nothing to scale against (also avoids dividing by zero below)
+        return;
    }

-    void update(size_t current, size_t total) {
-        if (!is_output_a_tty()) {
-            return;
-        }
+    size_t width = 50;
+    size_t pct = (100 * current) / total;
+    size_t pos = current < total ? (width * current) / total : width; // clamp to width so `width - pos` below cannot wrap around when current > total

-        if (!total) {
-            return;
-        }
-
-        std::lock_guard<std::mutex> lock(mutex);
-
-        if (lines.find(this) == lines.end()) {
-            lines[this] = max_line++;
-            std::cout << "\n";
-        }
-        int lines_up = max_line - lines[this];
-
-        size_t width = 50;
-        size_t pct = (100 * current) / total;
-        size_t pos = (width * current) / total;
-
-        std::cout << "\033[s";
-
-        if (lines_up > 0) {
-            std::cout << "\033[" << lines_up << "A";
-        }
-        std::cout << "\033[2K\r["
-            << std::string(pos, '=')
-            << (pos < width ? ">" : "")
-            << std::string(width - pos, ' ')
-            << "] " << std::setw(3) << pct << "%  ("
-            << current / (1024 * 1024) << " MB / "
-            << total / (1024 * 1024) << " MB) "
-            << "\033[u";
-
-        std::cout.flush();
-
-        if (current == total) {
-             cleanup(this);
-        }
-    }
-
-    ProgressBar(const ProgressBar &) = delete;
-    ProgressBar & operator=(const ProgressBar &) = delete;
-};
+    std::cout << "["
+              << std::string(pos, '=')
+              << (pos < width ? ">" : "")
+              << std::string(width - pos, ' ')
+              << "] " << std::setw(3) << pct << "%  ("
+              << current / (1024 * 1024) << " MB / "
+              << total / (1024 * 1024) << " MB)\r"; // '\r' returns to column 0 so the next call overwrites this line
+    std::cout.flush();
+}

 static bool common_pull_file(httplib::Client & cli,
                             const std::string & resolve_path,
@@ -565,19 +517,16 @@ static bool common_pull_file(httplib::Client & cli,
        headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
    }

-    const char * func = __func__; // avoid __func__ inside a lambda
-    size_t downloaded = existing_size;
-    size_t progress_step = 0;
-    ProgressBar bar;
+    std::atomic<size_t> downloaded{existing_size};

    auto res = cli.Get(resolve_path, headers,
        [&](const httplib::Response &response) {
            if (existing_size > 0 && response.status != 206) {
-                LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", func, response.status);
+                LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
                return false;
            }
            if (existing_size == 0 && response.status != 200) {
-                LOG_WRN("%s: download received non-successful status code: %d\n", func, response.status);
+                LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
                return false;
            }
            if (total_size == 0 && response.has_header("Content-Length")) {
@@ -585,7 +534,7 @@ static bool common_pull_file(httplib::Client & cli,
                    size_t content_length = std::stoull(response.get_header_value("Content-Length"));
                    total_size = existing_size + content_length;
                } catch (const std::exception &e) {
-                    LOG_WRN("%s: invalid Content-Length header: %s\n", func, e.what());
+                    LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
                }
            }
            return true;
@@ -593,21 +542,18 @@ static bool common_pull_file(httplib::Client & cli,
        [&](const char *data, size_t len) {
            ofs.write(data, len);
            if (!ofs) {
-                LOG_ERR("%s: error writing to file: %s\n", func, path_tmp.c_str());
+                LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
                return false;
            }
            downloaded += len;
-            progress_step += len;
-
-            if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                bar.update(downloaded, total_size);
-                progress_step = 0;
-            }
+            print_progress(downloaded, total_size);
            return true;
        },
        nullptr
    );

+    std::cout << "\n";
+
    if (!res) {
        LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
        return false;
@@ -1101,7 +1047,7 @@ std::string common_docker_resolve_model(const std::string &) {
 std::vector<common_cached_model_info> common_list_cached_models() {
    std::vector<common_cached_model_info> models;
    const std::string cache_dir = fs_get_cache_directory();
-    const std::vector<common_file_info> files = fs_list(cache_dir, false);
+    const std::vector<common_file_info> files = fs_list_files(cache_dir);
    for (const auto & file : files) {
        if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
            common_cached_model_info model_info;
--- a/common/download.h
+++ b/common/download.h
@@ -14,10 +14,8 @@ struct common_cached_model_info {
    std::string model;
    std::string tag;
    size_t      size = 0; // GGUF size in bytes
-    // return string representation like "user/model:tag"
-    // if tag is "latest", it will be omitted
    std::string to_string() const {
-        return user + "/" + model + (tag == "latest" ? "" : ":" + tag);
+        return user + "/" + model + ":" + tag;
    }
 };

--- a/common/json-partial.cpp
+++ b/common/json-partial.cpp
@@ -297,25 +297,8 @@ bool common_json_parse(
            it = temptative_end;
            return true;
        }
-        // handle unclosed top-level primitive
-        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
-            std::string str(it, temptative_end);
-            const auto & magic_seed = out.healing_marker.marker = healing_marker;
-            if (can_parse(str + "\"")) {
-                // Was inside an string
-                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
-            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
-                // Was inside an string after an escape
-                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
-            } else {
-                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
-                // fprintf(stderr, "Closing: TODO\n");
-                return false;
-            }
-            out.json = json::parse(str);
-            it = temptative_end;
-            return true;
-        }
+        // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+        // fprintf(stderr, "Closing: TODO\n");
        return false;
    }
    out.json = json::parse(it, end);
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
 }

 std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
-std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
 std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
 std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
-    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
+    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
 };

 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
@@ -303,8 +303,6 @@ static std::string format_literal(const std::string & literal) {
    return "\"" + escaped + "\"";
 }

-std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
-
 class SchemaConverter {
 private:
    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
@@ -974,7 +972,7 @@ public:

    void check_errors() {
        if (!_errors.empty()) {
-            throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
+            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
        }
        if (!_warnings.empty()) {
            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -18,6 +18,4 @@ struct common_grammar_options {
    bool dotall = false;
 };

-std::string gbnf_format_literal(const std::string & literal);
-
 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,4 +1,3 @@
-#include "common.h"
 #include "log.h"

 #include <chrono>
@@ -27,6 +26,30 @@ void common_log_set_verbosity_thold(int verbosity) {
    common_log_verbosity_thold = verbosity;
 }

+// Decide whether colored log output should be on by default, from the environment and the TTY state
+static bool common_log_should_use_colors_auto() {
+    // A NO_COLOR variable that is set and non-empty turns colors off (https://no-color.org/)
+    const char * env_no_color = std::getenv("NO_COLOR");
+    if (env_no_color != nullptr && env_no_color[0] != '\0') {
+        return false;
+    }
+
+    // TERM=dumb turns colors off as well
+    const char * env_term = std::getenv("TERM");
+    if (env_term != nullptr && std::strcmp(env_term, "dumb") == 0) {
+        return false;
+    }
+
+    // Log messages can be written to either stdout or stderr,
+    // so a terminal attached to either stream is enough to enable colors
+    const int  fd_out  = fileno(stdout);
+    const int  fd_err  = fileno(stderr);
+    const bool tty_out = isatty(fd_out);
+    const bool tty_err = isatty(fd_err);
+
+    return tty_out || tty_err;
+}
+
 static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -368,7 +391,7 @@ struct common_log * common_log_main() {
    static std::once_flag    init_flag;
    std::call_once(init_flag, [&]() {
        // Set default to auto-detect colors
-        log.set_colors(tty_can_use_colors());
+        log.set_colors(common_log_should_use_colors_auto());
    });

    return &log;
@@ -399,7 +422,7 @@ void common_log_set_file(struct common_log * log, const char * file) {

 void common_log_set_colors(struct common_log * log, log_colors colors) {
    if (colors == LOG_COLORS_AUTO) {
-        log->set_colors(tty_can_use_colors());
+        log->set_colors(common_log_should_use_colors_auto());
        return;
    }

@@ -420,27 +443,8 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }

-void common_log_flush(struct common_log * log) {
-    log->pause();
-    log->resume();
-}
-
-static int common_get_verbosity(enum ggml_log_level level) {
-    switch (level) {
-        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
-        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
-        case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
-        case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
-        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
-        case GGML_LOG_LEVEL_NONE:
-        default:
-            return LOG_LEVEL_OUTPUT;
-    }
-}
-
 void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
-    auto verbosity = common_get_verbosity(level);
-    if (verbosity <= common_log_verbosity_thold) {
+    if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
        common_log_add(common_log_main(), level, "%s", text);
    }
 }
--- a/common/log.h
+++ b/common/log.h
@@ -21,14 +21,8 @@
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#define LOG_LEVEL_DEBUG  4
-#define LOG_LEVEL_INFO   3
-#define LOG_LEVEL_WARN   2
-#define LOG_LEVEL_ERROR  1
-#define LOG_LEVEL_OUTPUT 0 // output data from tools
-
-#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
-#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO
+#define LOG_DEFAULT_DEBUG 1
+#define LOG_DEFAULT_LLAMA 0

 enum log_colors {
    LOG_COLORS_AUTO     = -1,
@@ -73,18 +67,16 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 //   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
 //   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
 //
+// I - info    (stdout, V = 0)
+// W - warning (stderr, V = 0)
+// E - error   (stderr, V = 0)
 // D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
-// I - info    (stdout, V = LOG_DEFAULT_INFO)
-// W - warning (stderr, V = LOG_DEFAULT_WARN)
-// E - error   (stderr, V = LOG_DEFAULT_ERROR)
-// O - output  (stdout, V = LOG_DEFAULT_OUTPUT)
 //

 void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
 void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
 void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
 void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
-void common_log_flush         (struct common_log * log);                    // flush all pending log messages

 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -103,14 +95,14 @@ void common_log_flush         (struct common_log * log);                    // f
        } \
    } while (0)

-#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
-#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)
+#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
+#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)

-#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG,  __VA_ARGS__)
-#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,   __VA_ARGS__)
-#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,   __VA_ARGS__)
-#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR,  __VA_ARGS__)
-#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,   __VA_ARGS__) // same as INFO
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__) // same verbosity as LOG_INF

 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
--- a/common/peg-parser.cpp
+++ b/common/peg-parser.cpp
--- a/common/peg-parser.h
+++ b/common/peg-parser.h
@@ -1,459 +0,0 @@
-#pragma once
-
-#include <nlohmann/json_fwd.hpp>
-
-#include <memory>
-#include <unordered_map>
-#include <string>
-#include <string_view>
-#include <functional>
-#include <vector>
-#include <variant>
-
-struct common_grammar_builder;
-
-class common_peg_parser_builder;
-
-using common_peg_parser_id = size_t;
-constexpr common_peg_parser_id COMMON_PEG_INVALID_PARSER_ID = static_cast<common_peg_parser_id>(-1);
-
-using common_peg_ast_id = size_t;
-constexpr common_peg_ast_id COMMON_PEG_INVALID_AST_ID = static_cast<common_peg_ast_id>(-1);
-
-// Lightweight wrapper around common_peg_parser_id for convenience
-class common_peg_parser {
-    common_peg_parser_id id_;
-    common_peg_parser_builder & builder_;
-
-  public:
-    common_peg_parser(const common_peg_parser & other) : id_(other.id_), builder_(other.builder_) {}
-    common_peg_parser(common_peg_parser_id id, common_peg_parser_builder & builder) : id_(id), builder_(builder) {}
-
-    common_peg_parser & operator=(const common_peg_parser & other);
-    common_peg_parser & operator+=(const common_peg_parser & other);
-    common_peg_parser & operator|=(const common_peg_parser & other);
-
-    operator common_peg_parser_id() const { return id_; }
-    common_peg_parser_id id() const { return id_; }
-
-    common_peg_parser_builder & builder() const { return builder_; }
-
-    // Creates a sequence
-    common_peg_parser operator+(const common_peg_parser & other) const;
-
-    // Creates a sequence separated by spaces.
-    common_peg_parser operator<<(const common_peg_parser & other) const;
-
-    // Creates a choice
-    common_peg_parser operator|(const common_peg_parser & other) const;
-
-    common_peg_parser operator+(const char * str) const;
-    common_peg_parser operator+(const std::string & str) const;
-    common_peg_parser operator<<(const char * str) const;
-    common_peg_parser operator<<(const std::string & str) const;
-    common_peg_parser operator|(const char * str) const;
-    common_peg_parser operator|(const std::string & str) const;
-};
-
-common_peg_parser operator+(const char * str, const common_peg_parser & p);
-common_peg_parser operator+(const std::string & str, const common_peg_parser & p);
-common_peg_parser operator<<(const char * str, const common_peg_parser & p);
-common_peg_parser operator<<(const std::string & str, const common_peg_parser & p);
-common_peg_parser operator|(const char * str, const common_peg_parser & p);
-common_peg_parser operator|(const std::string & str, const common_peg_parser & p);
-
-enum common_peg_parse_result_type {
-    COMMON_PEG_PARSE_RESULT_FAIL            = 0,
-    COMMON_PEG_PARSE_RESULT_SUCCESS         = 1,
-    COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT = 2,
-};
-
-const char * common_peg_parse_result_type_name(common_peg_parse_result_type type);
-
-struct common_peg_ast_node {
-    common_peg_ast_id id;
-    std::string rule;
-    std::string tag;
-    size_t start;
-    size_t end;
-    std::string_view text;
-    std::vector<common_peg_ast_id> children;
-
-    bool is_partial = false;
-};
-
-struct common_peg_parse_result;
-
-using common_peg_ast_visitor = std::function<void(const common_peg_ast_node & node)>;
-
-class common_peg_ast_arena {
-    std::vector<common_peg_ast_node> nodes_;
-  public:
-    common_peg_ast_id add_node(
-        const std::string & rule,
-        const std::string & tag,
-        size_t start,
-        size_t end,
-        std::string_view text,
-        std::vector<common_peg_ast_id> children,
-        bool is_partial = false
-    ) {
-        common_peg_ast_id id = nodes_.size();
-        nodes_.push_back({id, rule, tag, start, end, text, std::move(children), is_partial});
-        return id;
-    }
-
-    const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
-
-    size_t size() const { return nodes_.size(); }
-
-    void clear() { nodes_.clear(); }
-
-    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
-    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
-};
-
-struct common_peg_parse_result {
-    common_peg_parse_result_type type = COMMON_PEG_PARSE_RESULT_FAIL;
-    size_t start = 0;
-    size_t end = 0;
-
-    std::vector<common_peg_ast_id> nodes;
-
-    common_peg_parse_result() = default;
-
-    common_peg_parse_result(common_peg_parse_result_type type, size_t start)
-        : type(type), start(start), end(start) {}
-
-    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end)
-        : type(type), start(start), end(end) {}
-
-    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end, std::vector<common_peg_ast_id> nodes)
-        : type(type), start(start), end(end), nodes(std::move(nodes)) {}
-
-    bool fail() const { return type == COMMON_PEG_PARSE_RESULT_FAIL; }
-    bool need_more_input() const { return type == COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT; }
-    bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
-};
-
-struct common_peg_parse_context {
-    std::string input;
-    bool is_partial;
-    common_peg_ast_arena ast;
-
-    int parse_depth;
-
-    common_peg_parse_context()
-        : is_partial(false), parse_depth(0) {}
-
-    common_peg_parse_context(const std::string & input)
-        : input(input), is_partial(false), parse_depth(0) {}
-
-    common_peg_parse_context(const std::string & input, bool is_partial)
-        : input(input), is_partial(is_partial), parse_depth(0) {}
-};
-
-class common_peg_arena;
-
-// Parser variants
-struct common_peg_epsilon_parser {};
-
-struct common_peg_start_parser {};
-
-struct common_peg_end_parser {};
-
-struct common_peg_literal_parser {
-    std::string literal;
-};
-
-struct common_peg_sequence_parser {
-    std::vector<common_peg_parser_id> children;
-};
-
-struct common_peg_choice_parser {
-    std::vector<common_peg_parser_id> children;
-};
-
-struct common_peg_repetition_parser {
-    common_peg_parser_id child;
-    int min_count;
-    int max_count;  // -1 for unbounded
-};
-
-struct common_peg_and_parser {
-    common_peg_parser_id child;
-};
-
-struct common_peg_not_parser {
-    common_peg_parser_id child;
-};
-
-struct common_peg_any_parser {};
-
-struct common_peg_space_parser {};
-
-struct common_peg_chars_parser {
-    struct char_range {
-        uint32_t start;
-        uint32_t end;
-        bool contains(uint32_t codepoint) const { return codepoint >= start && codepoint <= end; }
-    };
-
-    std::string pattern;
-    std::vector<char_range> ranges;
-    bool negated;
-    int min_count;
-    int max_count;  // -1 for unbounded
-};
-
-struct common_peg_json_string_parser {};
-
-struct common_peg_until_parser {
-    std::vector<std::string> delimiters;
-};
-
-struct common_peg_schema_parser {
-    common_peg_parser_id child;
-    std::string name;
-    std::shared_ptr<nlohmann::ordered_json> schema;
-
-    // Indicates if the GBNF should accept a raw string that matches the schema.
-    bool raw;
-};
-
-struct common_peg_rule_parser {
-    std::string name;
-    common_peg_parser_id child;
-    bool trigger;
-};
-
-struct common_peg_ref_parser {
-    std::string name;
-};
-
-struct common_peg_atomic_parser {
-    common_peg_parser_id child;
-};
-
-struct common_peg_tag_parser {
-    common_peg_parser_id child;
-    std::string tag;
-};
-
-// Variant holding all parser types
-using common_peg_parser_variant = std::variant<
-    common_peg_epsilon_parser,
-    common_peg_start_parser,
-    common_peg_end_parser,
-    common_peg_literal_parser,
-    common_peg_sequence_parser,
-    common_peg_choice_parser,
-    common_peg_repetition_parser,
-    common_peg_and_parser,
-    common_peg_not_parser,
-    common_peg_any_parser,
-    common_peg_space_parser,
-    common_peg_chars_parser,
-    common_peg_json_string_parser,
-    common_peg_until_parser,
-    common_peg_schema_parser,
-    common_peg_rule_parser,
-    common_peg_ref_parser,
-    common_peg_atomic_parser,
-    common_peg_tag_parser
->;
-
-class common_peg_arena {
-    std::vector<common_peg_parser_variant> parsers_;
-    std::unordered_map<std::string, common_peg_parser_id> rules_;
-    common_peg_parser_id root_ = COMMON_PEG_INVALID_PARSER_ID;
-
-  public:
-    const common_peg_parser_variant & get(common_peg_parser_id id) const { return parsers_.at(id); }
-    common_peg_parser_variant & get(common_peg_parser_id id) { return parsers_.at(id); }
-
-    size_t size() const { return parsers_.size(); }
-    bool empty() const { return parsers_.empty(); }
-
-    common_peg_parser_id get_rule(const std::string & name) const;
-    bool has_rule(const std::string & name) const { return rules_.find(name) != rules_.end(); }
-
-    common_peg_parser_id root() const { return root_; }
-    void set_root(common_peg_parser_id id) { root_ = id; }
-
-    common_peg_parse_result parse(common_peg_parse_context & ctx, size_t start = 0) const;
-    common_peg_parse_result parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const;
-
-    void resolve_refs();
-
-    void build_grammar(const common_grammar_builder & builder, bool lazy = false) const;
-
-    std::string dump(common_peg_parser_id id) const;
-
-    nlohmann::json to_json() const;
-    static common_peg_arena from_json(const nlohmann::json & j);
-
-    std::string save() const;
-    void load(const std::string & data);
-
-    friend class common_peg_parser_builder;
-
-  private:
-    common_peg_parser_id add_parser(common_peg_parser_variant parser);
-    void add_rule(const std::string & name, common_peg_parser_id id);
-
-    common_peg_parser_id resolve_ref(common_peg_parser_id id);
-};
-
-class common_peg_parser_builder {
-    common_peg_arena arena_;
-
-    common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
-    common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
-
-  public:
-    common_peg_parser_builder();
-
-    // Match nothing, always succeed.
-    //   S -> ε
-    common_peg_parser eps() { return add(common_peg_epsilon_parser{}); }
-
-    // Matches the start of the input.
-    //   S -> ^
-    common_peg_parser start() { return add(common_peg_start_parser{}); }
-
-    // Matches the end of the input.
-    //   S -> $
-    common_peg_parser end() { return add(common_peg_end_parser{}); }
-
-    // Matches an exact literal string.
-    //   S -> "hello"
-    common_peg_parser literal(const std::string & literal) { return add(common_peg_literal_parser{literal}); }
-
-    // Matches a sequence of parsers in order, all must succeed.
-    //   S -> A B C
-    common_peg_parser sequence() { return add(common_peg_sequence_parser{}); }
-    common_peg_parser sequence(const std::vector<common_peg_parser_id> & parsers);
-    common_peg_parser sequence(const std::vector<common_peg_parser> & parsers);
-    common_peg_parser sequence(std::initializer_list<common_peg_parser> parsers);
-
-    // Matches the first parser that succeeds from a list of alternatives.
-    //   S -> A | B | C
-    common_peg_parser choice() { return add(common_peg_choice_parser{}); }
-    common_peg_parser choice(const std::vector<common_peg_parser_id> & parsers);
-    common_peg_parser choice(const std::vector<common_peg_parser> & parsers);
-    common_peg_parser choice(std::initializer_list<common_peg_parser> parsers);
-
-    // Matches one or more repetitions of a parser.
-    //   S -> A+
-    common_peg_parser one_or_more(const common_peg_parser & p) { return repeat(p, 1, -1); }
-
-    // Matches zero or more repetitions of a parser, always succeeds.
-    //   S -> A*
-    common_peg_parser zero_or_more(const common_peg_parser & p) { return repeat(p, 0, -1); }
-
-    // Matches zero or one occurrence of a parser, always succeeds.
-    //   S -> A?
-    common_peg_parser optional(const common_peg_parser & p) { return repeat(p, 0, 1); }
-
-    // Positive lookahead: succeeds if child parser succeeds, consumes no input.
-    //   S -> &A
-    common_peg_parser peek(const common_peg_parser & p) { return add(common_peg_and_parser{p}); }
-
-    // Negative lookahead: succeeds if child parser fails, consumes no input.
-    //   S -> !A
-    common_peg_parser negate(const common_peg_parser & p) { return add(common_peg_not_parser{p}); }
-
-    // Matches any single character.
-    //   S -> .
-    common_peg_parser any() { return add(common_peg_any_parser{}); }
-
-    // Matches between min and max repetitions of characters from a character class.
-    //   S -> [a-z]{m,n}
-    //
-    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
-    common_peg_parser chars(const std::string & classes, int min = 1, int max = -1);
-
-    // Creates a lightweight reference to a named rule (resolved during build()).
-    // Use this for forward references in recursive grammars.
-    //   expr_ref -> expr
-    common_peg_parser ref(const std::string & name) { return add(common_peg_ref_parser{name}); }
-
-    // Matches zero or more whitespace characters (space, tab, newline).
-    //   S -> [ \t\n]*
-    common_peg_parser space() { return add(common_peg_space_parser{}); }
-
-    // Matches all characters until a delimiter is found (delimiter not consumed).
-    //   S -> (!delim .)*
-    common_peg_parser until(const std::string & delimiter) { return add(common_peg_until_parser{{delimiter}}); }
-
-    // Matches all characters until one of the delimiters in the list is found (delimiter not consumed).
-    //   S -> (!delim .)*
-    common_peg_parser until_one_of(const std::vector<std::string> & delimiters) { return add(common_peg_until_parser{delimiters}); }
-
-    // Matches everything
-    //   S -> .*
-    common_peg_parser rest() { return until_one_of({}); }
-
-    // Matches between min and max repetitions of a parser (inclusive).
-    //   S -> A{m,n}
-    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
-    common_peg_parser repeat(const common_peg_parser & p, int min, int max) { return add(common_peg_repetition_parser{p, min,max}); }
-
-    // Matches exactly n repetitions of a parser.
-    //   S -> A{n}
-    common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
-
-    // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
-    //   value -> object | array | string | number | true | false | null
-    common_peg_parser json();
-    common_peg_parser json_object();
-    common_peg_parser json_string();
-    common_peg_parser json_array();
-    common_peg_parser json_number();
-    common_peg_parser json_bool();
-    common_peg_parser json_null();
-
-    // Matches JSON string content without the surrounding quotes.
-    // Useful for extracting content within a JSON string.
-    common_peg_parser json_string_content();
-
-    // Matches a JSON object member with a key and associated parser as the
-    // value.
-    common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
-
-    // Wraps a parser with JSON schema metadata for grammar generation.
-    // Used internally to convert JSON schemas to GBNF grammar rules.
-    common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
-
-    // Creates a named rule, stores it in the grammar, and returns a ref.
-    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
-    //   auto json = p.rule("json", json_obj | json_arr | ...)
-    common_peg_parser rule(const std::string & name, const common_peg_parser & p, bool trigger = false);
-
-    // Creates a named rule using a builder function, and returns a ref.
-    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
-    //   auto json = p.rule("json", [&]() { return json_object() | json_array() | ... })
-    common_peg_parser rule(const std::string & name, const std::function<common_peg_parser()> & builder, bool trigger = false);
-
-    // Creates a trigger rule. When generating a lazy grammar from the parser,
-    // only trigger rules and descendents are emitted.
-    common_peg_parser trigger_rule(const std::string & name, const common_peg_parser & p) { return rule(name, p, true); }
-    common_peg_parser trigger_rule(const std::string & name, const std::function<common_peg_parser()> & builder) { return rule(name, builder, true); }
-
-    // Creates an atomic parser. Atomic parsers do not create an AST node if
-    // the child results in a partial parse, i.e. NEEDS_MORE_INPUT. This is
-    // intended for situations where partial output is undesirable.
-    common_peg_parser atomic(const common_peg_parser & p) { return add(common_peg_atomic_parser{p}); }
-
-    // Tags create nodes in the generated AST for semantic purposes.
-    // Unlike rules, you can tag multiple nodes with the same tag.
-    common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
-
-    void set_root(const common_peg_parser & p);
-
-    common_peg_arena build();
-};
-
-// Helper function for building parsers
-common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -1,186 +0,0 @@
-#include "arg.h"
-#include "preset.h"
-#include "peg-parser.h"
-#include "log.h"
-
-#include <fstream>
-#include <sstream>
-#include <filesystem>
-
-static std::string rm_leading_dashes(const std::string & str) {
-    size_t pos = 0;
-    while (pos < str.size() && str[pos] == '-') {
-        ++pos;
-    }
-    return str.substr(pos);
-}
-
-std::vector<std::string> common_preset::to_args() const {
-    std::vector<std::string> args;
-
-    for (const auto & [opt, value] : options) {
-        args.push_back(opt.args.back()); // use the last arg as the main arg
-        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
-            // flag option, no value
-            if (common_arg_utils::is_falsey(value)) {
-                // use negative arg if available
-                if (!opt.args_neg.empty()) {
-                    args.back() = opt.args_neg.back();
-                } else {
-                    // otherwise, skip the flag
-                    // TODO: maybe throw an error instead?
-                    args.pop_back();
-                }
-            }
-        }
-        if (opt.value_hint != nullptr) {
-            // single value
-            args.push_back(value);
-        }
-        if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
-            throw std::runtime_error(string_format(
-                "common_preset::to_args(): option '%s' has two values, which is not supported yet",
-                opt.args.back()
-            ));
-        }
-    }
-
-    return args;
-}
-
-std::string common_preset::to_ini() const {
-    std::ostringstream ss;
-
-    ss << "[" << name << "]\n";
-    for (const auto & [opt, value] : options) {
-        auto espaced_value = value;
-        string_replace_all(espaced_value, "\n", "\\\n");
-        ss << rm_leading_dashes(opt.args.back()) << " = ";
-        ss << espaced_value << "\n";
-    }
-    ss << "\n";
-
-    return ss.str();
-}
-
-static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
-    std::map<std::string, std::map<std::string, std::string>> parsed;
-
-    if (!std::filesystem::exists(path)) {
-        throw std::runtime_error("preset file does not exist: " + path);
-    }
-
-    std::ifstream file(path);
-    if (!file.good()) {
-        throw std::runtime_error("failed to open server preset file: " + path);
-    }
-
-    std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-
-    static const auto parser = build_peg_parser([](auto & p) {
-        // newline ::= "\r\n" / "\n" / "\r"
-        auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
-
-        // ws ::= [ \t]*
-        auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
-
-        // comment ::= [;#] (!newline .)*
-        auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
-
-        // eol ::= ws comment? (newline / EOF)
-        auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
-
-        // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
-        auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
-
-        // value ::= (!eol-start .)*
-        auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
-        auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
-
-        // header-line ::= "[" ws ident ws "]" eol
-        auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
-
-        // kv-line ::= ident ws "=" ws value eol
-        auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
-
-        // comment-line ::= ws comment (newline / EOF)
-        auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
-
-        // blank-line ::= ws (newline / EOF)
-        auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
-
-        // line ::= header-line / kv-line / comment-line / blank-line
-        auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
-
-        // ini ::= line* EOF
-        auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
-
-        return ini;
-    });
-
-    common_peg_parse_context ctx(contents);
-    const auto result = parser.parse(ctx);
-    if (!result.success()) {
-        throw std::runtime_error("failed to parse server config file: " + path);
-    }
-
-    std::string current_section = COMMON_PRESET_DEFAULT_NAME;
-    std::string current_key;
-
-    ctx.ast.visit(result, [&](const auto & node) {
-        if (node.tag == "section-name") {
-            const std::string section = std::string(node.text);
-            current_section = section;
-            parsed[current_section] = {};
-        } else if (node.tag == "key") {
-            const std::string key = std::string(node.text);
-            current_key = key;
-        } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
-            parsed[current_section][current_key] = std::string(node.text);
-            current_key.clear();
-        }
-    });
-
-    return parsed;
-}
-
-static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
-    std::map<std::string, common_arg> mapping;
-    for (const auto & opt : ctx_params.options) {
-        for (const auto & env : opt.get_env()) {
-            mapping[env] = opt;
-        }
-        for (const auto & arg : opt.get_args()) {
-            mapping[rm_leading_dashes(arg)] = opt;
-        }
-    }
-    return mapping;
-}
-
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
-    common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
-    auto ini_data = parse_ini_from_file(path);
-
-    for (auto section : ini_data) {
-        common_preset preset;
-        if (section.first.empty()) {
-            preset.name = COMMON_PRESET_DEFAULT_NAME;
-        } else {
-            preset.name = section.first;
-        }
-        LOG_DBG("loading preset: %s\n", preset.name.c_str());
-        for (const auto & [key, value] : section.second) {
-            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
-            if (key_to_opt.find(key) != key_to_opt.end()) {
-                preset.options[key_to_opt[key]] = value;
-                LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
-            } else {
-                // TODO: maybe warn about unknown key?
-            }
-        }
-        out[preset.name] = preset;
-    }
-
-    return out;
-}
--- a/common/preset.h
+++ b/common/preset.h
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "common.h"
-#include "arg.h"
-
-#include <string>
-#include <vector>
-#include <map>
-
-//
-// INI preset parser and writer
-//
-
-constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
-
-struct common_preset {
-    std::string name;
-    // TODO: support repeated args in the future
-    std::map<common_arg, std::string> options;
-
-    // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
-
-    // convert preset to INI format string
-    std::string to_ini() const;
-
-    // TODO: maybe implement to_env() if needed
-};
-
-// interface for multiple presets in one file
-using common_presets = std::map<std::string, common_preset>;
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -3,10 +3,9 @@
 #include "common.h"
 #include "log.h"

-#include <algorithm>
 #include <cmath>
-#include <cstring>
 #include <unordered_map>
+#include <algorithm>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -113,13 +112,6 @@ struct common_sampler {

    llama_token_data_array cur_p;

-    void reset() {
-        prev.clear();
-
-        llama_sampler_reset(grmr);
-        llama_sampler_reset(chain);
-    }
-
    void set_logits(struct llama_context * ctx, int idx) {
        const auto * logits = llama_get_logits_ith(ctx, idx);

@@ -136,12 +128,6 @@ struct common_sampler {

        cur_p = { cur.data(), cur.size(), -1, false };
    }
-
-    common_time_meas tm() {
-        return common_time_meas(t_total_us, params.no_perf);
-    }
-
-    mutable int64_t t_total_us = 0;
 };

 std::string common_params_sampling::print() const {
@@ -312,8 +298,6 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }

 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    const auto tm = gsmpl->tm();
-
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
@@ -324,7 +308,9 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
 }

 void common_sampler_reset(struct common_sampler * gsmpl) {
-    gsmpl->reset();
+    llama_sampler_reset(gsmpl->grmr);
+
+    llama_sampler_reset(gsmpl->chain); // NOTE(review): gsmpl->prev is left untouched by this reset - confirm that is intended
 }

 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -341,54 +327,16 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
    // TODO: measure grammar performance

-    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
-
-    llama_perf_sampler_data data_smpl;
-    llama_perf_context_data data_ctx;
-
-    memset(&data_smpl, 0, sizeof(data_smpl));
-    memset(&data_ctx,  0, sizeof(data_ctx));
-
    if (gsmpl) {
-        auto & data = data_smpl;
-
-        data = llama_perf_sampler(gsmpl->chain);
-
-        // note: the sampling time includes the samplers time + extra time spent in common/sampling
-        LOG_INF("%s:    sampling time = %10.2f ms\n", __func__, t_sampling_ms);
-        LOG_INF("%s:    samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
+        llama_perf_sampler_print(gsmpl->chain);
    }
-
    if (ctx) {
-        auto & data = data_ctx;
-
-        data = llama_perf_context(ctx);
-
-        const double t_end_ms = 1e-3 * ggml_time_us();
-
-        const double t_total_ms = t_end_ms - data.t_start_ms;
-        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
-        const double t_unacc_pc = 100.0 * t_unacc_ms /  t_total_ms;
-
-        LOG_INF("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
-        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
-        LOG_INF("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
-        LOG_INF("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
-        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %%      (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
-        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);
-
+        llama_perf_context_print(ctx);
        llama_memory_breakdown_print(ctx);
    }
 }

 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
-    llama_synchronize(ctx);
-
-    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
-    const auto tm = gsmpl->tm();
-
    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
@@ -480,8 +428,6 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers

 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
-    const auto tm = gsmpl->tm();
-
    auto * res = &gsmpl->cur_p;

    if (do_sort && !res->sorted) {
--- a/common/unicode.cpp
+++ b/common/unicode.cpp
@@ -1,64 +0,0 @@
-#include "unicode.h"
-
-// implementation adopted from src/unicode.cpp
-
-size_t utf8_sequence_length(unsigned char first_byte) {
-    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-    uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
-    return lookup[highbits];
-}
-
-utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
-    if (offset >= input.size()) {
-        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
-    }
-
-    // ASCII fast path
-    if (!(input[offset] & 0x80)) {
-        return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
-    }
-
-    // Invalid: continuation byte as first byte
-    if (!(input[offset] & 0x40)) {
-        return utf8_parse_result(utf8_parse_result::INVALID);
-    }
-
-    // 2-byte sequence
-    if (!(input[offset] & 0x20)) {
-        if (offset + 1 >= input.size()) {
-            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
-        }
-        if ((input[offset + 1] & 0xc0) != 0x80) {
-            return utf8_parse_result(utf8_parse_result::INVALID);
-        }
-        auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f);
-        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
-    }
-
-    // 3-byte sequence
-    if (!(input[offset] & 0x10)) {
-        if (offset + 2 >= input.size()) {
-            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
-        }
-        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
-            return utf8_parse_result(utf8_parse_result::INVALID);
-        }
-        auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f);
-        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
-    }
-
-    // 4-byte sequence
-    if (!(input[offset] & 0x08)) {
-        if (offset + 3 >= input.size()) {
-            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
-        }
-        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
-            return utf8_parse_result(utf8_parse_result::INVALID);
-        }
-        auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f);
-        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
-    }
-
-    // Invalid first byte
-    return utf8_parse_result(utf8_parse_result::INVALID);
-}
--- a/common/unicode.h
+++ b/common/unicode.h
@@ -1,22 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <string_view>
-
-// UTF-8 parsing utilities for streaming-aware unicode support
-
-struct utf8_parse_result {
-    uint32_t codepoint;      // Decoded codepoint (only valid if status == SUCCESS)
-    size_t bytes_consumed;   // How many bytes this codepoint uses (1-4)
-    enum status { SUCCESS, INCOMPLETE, INVALID } status;
-
-    utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0)
-        : codepoint(cp), bytes_consumed(bytes), status(s) {}
-};
-
-// Determine the expected length of a UTF-8 sequence from its first byte
-// Returns 0 for invalid first bytes
-size_t utf8_sequence_length(unsigned char first_byte);
-
-// Parse a single UTF-8 codepoint from input
-utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -383,17 +383,6 @@ class ModelBase:
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                        tensors_to_remove.append(name)
-                    if name.endswith(".activation_scale"):  # unused
-                        tensors_to_remove.append(name)
-                    # mistral format
-                    if name.endswith(".qscale_weight"):
-                        weight_name = name.removesuffix("qscale_weight") + "weight"
-                        w = self.model_tensors[weight_name]
-                        s = self.model_tensors[name]
-                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
-                        tensors_to_remove.append(name)
-                    if name.endswith(".qscale_act"):
-                        tensors_to_remove.append(name)
            elif quant_method == "gptq":
                for name in self.model_tensors.keys():
                    if name.endswith(".qweight"):
@@ -576,7 +565,7 @@ class ModelBase:
                            gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
                        )
                    )
-                    or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
+                    or not new_name.endswith(".weight")
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

@@ -1535,79 +1524,6 @@ class TextModel(ModelBase):
        special_vocab._set_special_token("bos", 151643)
        special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_mistral(self):
-        if not _mistral_common_installed:
-            raise ImportError(_mistral_import_error_msg)
-
-        vocab = MistralVocab(self.dir_model)
-        logger.info(
-            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
-        )
-
-        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
-
-        tokens = []
-        scores = []
-        toktypes = []
-
-        for text, score, toktype in vocab.all_tokens():
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        assert len(tokens) == vocab.vocab_size, (
-            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
-        )
-
-        if vocab.tokenizer_type == MistralTokenizerType.tekken:
-            self.gguf_writer.add_tokenizer_pre("tekken")
-            self.gguf_writer.add_token_merges(
-                vocab.extract_vocab_merges_from_model()
-            )
-
-        logger.info(
-            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
-        )
-
-        self.gguf_writer.add_bos_token_id(vocab.bos_id)
-        self.gguf_writer.add_eos_token_id(vocab.eos_id)
-        self.gguf_writer.add_unk_token_id(vocab.unk_id)
-        self.gguf_writer.add_pad_token_id(vocab.pad_id)
-
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_vocab_size(vocab.vocab_size)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(False)
-
-        local_template_file_path = self.dir_model / "chat_template.jinja"
-
-        if self.is_mistral_format and local_template_file_path.is_file():
-            # Ministral-3 and other new Mistral models come with chat templates.
-            # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
-            logger.info("Using an existing Mistral local chat template.")
-
-            with open(local_template_file_path, "r", encoding="utf-8") as f:
-                template = f.read()
-        elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
-            template_dir = Path(__file__).parent / "models/templates/"
-
-            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
-            if self.is_mistral_format:
-                logger.info(
-                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
-                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
-                )
-            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
-        else:
-            logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
-            template = None
-
-        if template is not None:
-            self.gguf_writer.add_chat_template(template)
-

 class MmprojModel(ModelBase):
    model_type = ModelType.MMPROJ
@@ -1665,27 +1581,10 @@ class MmprojModel(ModelBase):

        # load preprocessor config
        self.preprocessor_config = {}
-
-        # prefer preprocessor_config.json if possible
-        preprocessor_config_path = self.dir_model / "preprocessor_config.json"
-        if preprocessor_config_path.is_file():
-            with open(preprocessor_config_path, "r", encoding="utf-8") as f:
+        if not self.is_mistral_format:
+            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
                self.preprocessor_config = json.load(f)

-        # prefer processor_config.json if possible
-        processor_config_path = self.dir_model / "processor_config.json"
-        if processor_config_path.is_file():
-            with open(processor_config_path, "r", encoding="utf-8") as f:
-                cfg = json.load(f)
-                # move image_processor to root level for compat
-                if "image_processor" in cfg:
-                    cfg = {
-                        **cfg,
-                        **cfg["image_processor"],
-                    }
-                # merge configs
-                self.preprocessor_config = {**self.preprocessor_config, **cfg}
-
    def get_vision_config(self) -> dict[str, Any] | None:
        config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
        return self.global_config.get(config_name)
@@ -1774,9 +1673,11 @@ class GPTNeoXModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GPTNEOX

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
@@ -1834,7 +1735,7 @@ class BloomModel(TextModel):
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1897,9 +1798,10 @@ class MPTModel(TextModel):
            self.gguf_writer.add_unk_token_id(0)

    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
@@ -1932,6 +1834,7 @@ class OrionModel(TextModel):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

@@ -1949,7 +1852,7 @@ class OrionModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
@@ -1966,6 +1869,7 @@ class BaichuanModel(TextModel):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

@@ -1982,7 +1886,7 @@ class BaichuanModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
@@ -2089,6 +1993,7 @@ class XverseModel(TextModel):
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

@@ -2105,7 +2010,7 @@ class XverseModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
@@ -2148,6 +2053,10 @@ class FalconModel(TextModel):
    model_arch = gguf.MODEL_ARCH.FALCON

    def set_gguf_parameters(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name
@@ -2160,7 +2069,7 @@ class FalconModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -2198,10 +2107,12 @@ class StarCoderModel(TextModel):
    model_arch = gguf.MODEL_ARCH.STARCODER

    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -2231,12 +2142,14 @@ class RefactModel(TextModel):
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

+        block_count = self.hparams["n_layer"]
+
        # refact uses Alibi. So this is from config.json which might be used by training.
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])

        self.gguf_writer.add_feed_forward_length(ff_dim)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
@@ -2283,10 +2196,11 @@ class StableLMModel(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
@@ -2378,6 +2292,67 @@ class LlamaModel(TextModel):
        if self.hf_arch == "VLlama3ForCausalLM":
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

+    def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
+        vocab = MistralVocab(self.dir_model)
+        logger.info(
+            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
+        )
+
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        template_dir = Path(__file__).parent / "models/templates/"
+
+        if not self.is_mistral_format or not self.disable_mistral_community_chat_template:
+            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
+            if self.is_mistral_format:
+                logger.info(
+                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
+                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
+                )
+            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
+            self.gguf_writer.add_chat_template(template)
+        else:
+            logger.info("Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
+
    def set_vocab(self):
        if self.is_mistral_format:
            return self._set_vocab_mistral()
@@ -2837,38 +2812,12 @@ class Llama4VisionModel(MmprojModel):

@ModelBase.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.MISTRAL3
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # for compatibility, we use LLAMA arch for older models
-        # TODO: remove this once everyone has migrated to newer version of llama.cpp
-        if self.hparams.get("model_type") != "ministral3":
-            self.model_arch = gguf.MODEL_ARCH.LLAMA
-            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
-            self.gguf_writer.add_architecture()
-            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        rope_params = self.hparams.get("rope_parameters")
-        if self.hparams.get("model_type") == "ministral3":
-            assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
-            assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
-            self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
-            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
+    model_arch = gguf.MODEL_ARCH.LLAMA

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        name = name.replace("language_model.", "")
        if "multi_modal_projector" in name or "vision_tower" in name:
            return []
-
        return super().modify_tensors(data_torch, name, bid)


@@ -3202,7 +3151,7 @@ class DbrxModel(TextModel):
    def set_gguf_parameters(self):
        ffn_config = self.hparams["ffn_config"]
        attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layers"])

        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
@@ -3404,7 +3353,7 @@ class QwenModel(TextModel):

    def set_gguf_parameters(self):
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
@@ -4249,51 +4198,6 @@ class Qwen3MoeModel(Qwen2MoeModel):
        super().set_vocab()


-@ModelBase.register("Qwen3NextForCausalLM")
-class Qwen3NextModel(Qwen2MoeModel):
-    model_arch = gguf.MODEL_ARCH.QWEN3NEXT
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"])
-        self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"])
-        self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"])
-        self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"])
-        self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"])
-        if (rope_dim := self.hparams.get("head_dim")) is None:
-            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.startswith("mtp"):
-            return [] # ignore MTP layers for now
-        if name.endswith(".A_log"):
-            data_torch = -torch.exp(data_torch)
-        elif name.endswith(".dt_bias"):
-            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
-        elif "conv1d" in name:
-            data_torch = data_torch.squeeze()
-        elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
-            data_torch = data_torch + 1
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-
-@ModelBase.register("RND1")
-class RND1Model(Qwen2MoeModel):
-    model_arch = gguf.MODEL_ARCH.RND1
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        # RND1 specific parameters
-        # RND1 uses bidirectional attention
-        self.gguf_writer.add_causal_attention(False)
-
-        if (mask_token_id := self.hparams.get("mask_token_id")) is not None:
-            self.gguf_writer.add_mask_token_id(mask_token_id)
-
-
@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
 class Qwen3VLVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
@@ -4480,7 +4384,7 @@ class GPT2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.GPT2

    def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -4512,6 +4416,8 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
        rot_pct = self.find_hparam(["partial_rotary_factor"])
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -4520,7 +4426,7 @@ class Phi2Model(TextModel):

        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(4 * n_embd)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
@@ -4638,6 +4544,8 @@ class Phi3MiniModel(TextModel):
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
@@ -4651,7 +4559,7 @@ class Phi3MiniModel(TextModel):
        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
@@ -4771,11 +4679,12 @@ class PlamoModel(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(4096)  # not in config.json
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
@@ -4898,6 +4807,7 @@ class Plamo2Model(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

        # Which layers are Mamba layers
@@ -4909,10 +4819,10 @@ class Plamo2Model(TextModel):
        num_attention_heads = []

        if mamba_enabled:
-            for i in range(self.block_count):
-                if self.block_count <= (mamba_step // 2):
+            for i in range(block_count):
+                if block_count <= (mamba_step // 2):
                    # use attention in last layer
-                    is_mamba = (i != self.block_count - 1)
+                    is_mamba = (i != block_count - 1)
                else:
                    is_mamba = (i % mamba_step) != (mamba_step // 2)
                if is_mamba:
@@ -4930,7 +4840,7 @@ class Plamo2Model(TextModel):
        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
        self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
        self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))

@@ -4987,10 +4897,12 @@ class CodeShellModel(TextModel):
    model_arch = gguf.MODEL_ARCH.CODESHELL

    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -5132,7 +5044,7 @@ class InternLM2Model(TextModel):

    def set_gguf_parameters(self):
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
@@ -5753,10 +5665,11 @@ class GemmaModel(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5792,10 +5705,11 @@ class Gemma2Model(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5833,19 +5747,18 @@ class Gemma3Model(TextModel):
    norm_shift = 1.0  # Gemma3RMSNorm adds 1.0 to the norm value

    def set_vocab(self):
-        if (self.dir_model / "tokenizer.model").is_file():
-            self._set_vocab_sentencepiece()
-            self.gguf_writer.add_add_space_prefix(False)
-        else:
-            self._set_vocab_gpt2()
+        self._set_vocab_sentencepiece()
+
+        self.gguf_writer.add_add_space_prefix(False)

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        # some default values are not specified in the hparams
        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
@@ -5855,24 +5768,13 @@ class Gemma3Model(TextModel):
        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
        # attn_logit_softcapping is removed in Gemma3
        assert hparams.get("attn_logit_softcapping") is None
-        if (final_logit_softcap := hparams.get("final_logit_softcapping")):
-            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
-        if hparams.get("sliding_window_pattern") != 1:
-            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
        if hparams.get("rope_scaling") is not None:
-            rope_scaling = hparams["rope_scaling"]
-            if rope_scaling["rope_type"] == "linear":
-                # important: this rope_scaling is only applied for global layers, and not used by 1B model
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            elif rope_scaling["rope_type"] == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-                self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"])
-                self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"])
-                self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"])
+            assert hparams["rope_scaling"]["rope_type"] == "linear"
+            # important: this rope_scaling is only applied for global layers, and not used by 1B model
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@@ -5886,10 +5788,8 @@ class Gemma3Model(TextModel):

        # remove OOV (out-of-vocabulary) rows in token_embd
        if "embed_tokens.weight" in name:
-            if (self.dir_model / "tokenizer.model").is_file():
-                tokens = self._create_vocab_sentencepiece()[0]
-            else:
-                tokens = self.get_vocab_base()[0]
+            vocab = self._create_vocab_sentencepiece()
+            tokens = vocab[0]
            data_torch = data_torch[:len(tokens)]

        # ref code in Gemma3RMSNorm
@@ -6134,6 +6034,7 @@ class Rwkv6Model(TextModel):
        self._set_vocab_rwkv_world()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_size = self.hparams["head_size"]
        hidden_size = self.hparams["hidden_size"]
        layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -6145,7 +6046,7 @@ class Rwkv6Model(TextModel):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
        self.gguf_writer.add_wkv_head_size(head_size)
@@ -6209,6 +6110,7 @@ class RWKV6Qwen2Model(Rwkv6Model):
            self._set_vocab_gpt2()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        num_attention_heads = self.hparams["num_attention_heads"]
        num_key_value_heads = self.hparams["num_key_value_heads"]
        hidden_size = self.hparams["hidden_size"]
@@ -6221,7 +6123,7 @@ class RWKV6Qwen2Model(Rwkv6Model):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_wkv_head_size(head_size)
        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
@@ -6262,6 +6164,7 @@ class Rwkv7Model(TextModel):
        return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        try:
            head_size = self.hparams["head_size"]
            layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -6286,7 +6189,7 @@ class Rwkv7Model(TextModel):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
        self.gguf_writer.add_wkv_head_size(head_size)
        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -6380,6 +6283,7 @@ class ARwkv7Model(Rwkv7Model):
            self._set_vocab_gpt2()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        hidden_size = self.hparams["hidden_size"]
        head_size = self.hparams["head_size"]
        rms_norm_eps = self.hparams["rms_norm_eps"]
@@ -6396,7 +6300,7 @@ class ARwkv7Model(Rwkv7Model):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
        self.gguf_writer.add_wkv_head_size(head_size)
        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -7286,10 +7190,6 @@ class DeepseekV2Model(TextModel):
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-
-            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
-            # ref https://github.com/ggml-org/llama.cpp/pull/17945
            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])

    _experts: list[dict[str, Tensor]] | None = None
@@ -7624,7 +7524,7 @@ class T5Model(TextModel):
        self.gguf_writer.add_context_length(n_ctx)
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
        if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
            self.gguf_writer.add_decoder_block_count(dec_n_layer)
        self.gguf_writer.add_head_count(self.hparams["num_heads"])
@@ -7763,7 +7663,7 @@ class T5EncoderModel(TextModel):
        self.gguf_writer.add_context_length(n_ctx)
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
        self.gguf_writer.add_head_count(self.hparams["num_heads"])
        self.gguf_writer.add_key_length(self.hparams["d_kv"])
        self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -7826,7 +7726,7 @@ class JaisModel(TextModel):
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
@@ -8168,7 +8068,7 @@ class ChatGLMModel(TextModel):
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
@@ -8250,6 +8150,7 @@ class ExaoneModel(TextModel):
        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
        layer_norm_eps = hparams["layer_norm_epsilon"]
        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
        # attention_dropout_rate = hparams["attention_dropout"]
        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
@@ -8260,7 +8161,7 @@ class ExaoneModel(TextModel):
        self.gguf_writer.add_context_length(max_position_embeddings)
        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
        self.gguf_writer.add_feed_forward_length(intermediate_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(num_layers)
        self.gguf_writer.add_file_type(self.ftype)

        if (rope_theta := self.hparams.get("rope_theta")) is not None:
@@ -9894,34 +9795,12 @@ class ApertusModel(LlamaModel):


 class MistralModel(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.MISTRAL3
+    model_arch = gguf.MODEL_ARCH.LLAMA
    model_name = "Mistral"
    hf_arch = ""
    is_mistral_format = True
    undo_permute = False

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # for compatibility, we use LLAMA arch for older models
-        # TODO: remove this once everyone migrates to newer version of llama.cpp
-        if "llama_4_scaling" not in self.hparams:
-            self.model_arch = gguf.MODEL_ARCH.LLAMA
-            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
-            self.gguf_writer.add_architecture()
-            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
-    def dequant_model(self):
-        # transform quantization config into HF format
-        quant_config = self.hparams.get("quantization")
-        if quant_config is not None:
-            assert quant_config["qformat_weight"] == "fp8_e4m3"
-            self.hparams["quantization_config"] = {
-                "activation_scheme": "static",
-                "quant_method": "fp8",
-                "weight_block_size": None,
-            }
-        return super().dequant_model()
-
    @staticmethod
    def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
@@ -9961,116 +9840,6 @@ class MistralModel(LlamaModel):

        return template

-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
-
-    @staticmethod
-    def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
-        if "yarn" in hparams:
-            yarn_params = hparams["yarn"]
-            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
-            gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
-            gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
-            gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
-            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
-
-        if "llama_4_scaling" in hparams:
-            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
-
-
-class MistralMoeModel(DeepseekV2Model):
-    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
-    model_name = "Mistral"
-    hf_arch = ""
-    is_mistral_format = True
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        logger.info("Using MistralMoeModel")
-        # remap hparams from Mistral MoE format to DeepseekV2 format
-        # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic
-        # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
-        config = self.hparams
-        # Mistral key -> HF key
-        config_mapping = {
-            "dim": "hidden_size",
-            "norm_eps": "rms_norm_eps",
-            "n_kv_heads": "num_key_value_heads",
-            "n_layers": "num_hidden_layers",
-            "n_heads": "num_attention_heads",
-            "hidden_dim": "intermediate_size",
-        }
-        # HF key -> (Mistral key, default value)
-        top_level_mapping_with_default = {
-            "model_type": ("model_type", "transformer"),
-            "hidden_act": ("activation", "silu"),
-            "tie_word_embeddings": ("tied_embeddings", False),
-            "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
-            "max_position_embeddings": ("max_position_embeddings", 128_000),
-        }
-        # mapping top-level keys
-        for key, new_key in config_mapping.items():
-            if key in config:
-                config[new_key] = config[key]
-        for new_key, (key, default_value) in top_level_mapping_with_default.items():
-            config[new_key] = config.get(key, default_value)
-        # mapping MoE-specific keys
-        moe_config_map = {
-            "route_every_n": "moe_layer_freq",
-            "first_k_dense_replace": "first_k_dense_replace",
-            "num_experts_per_tok": "num_experts_per_tok",
-            "num_experts": "n_routed_experts",
-            "expert_hidden_dim": "moe_intermediate_size",
-            "routed_scale": "routed_scaling_factor",
-            "num_shared_experts": "n_shared_experts",
-            "num_expert_groups": "n_group",
-            "num_expert_groups_per_tok": "topk_group",
-        }
-        moe = config["moe"]
-        for key, new_key in moe_config_map.items():
-            if key in moe:
-                config[new_key] = moe[key]
-        # provide missing values
-        config["topk_method"] = None
-        config["norm_topk_prob"] = True
-        config["scoring_func"] = "softmax"
-
-    def set_vocab(self):
-        self._set_vocab_mistral()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
-        yarn_params = self.hparams["yarn"]
-        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
-
-        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
-        # ref https://github.com/ggml-org/llama.cpp/pull/17945
-        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
-            return []
-
-        # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
-        if name.endswith(".qscale_act"):
-            name = name.replace(".qscale_act", ".input_scale")
-        if name.endswith(".qscale_weight"):
-            name = name.replace(".qscale_weight", ".weight_scale")
-        if ".wkv_b." in name:
-            name = name.replace(".wkv_b.", ".kv_b_proj.")
-        if ".experts." in name:
-            name = name.replace(".experts.", ".mlp.experts.")
-            name = name.replace(".w1.", ".gate_proj.")
-            name = name.replace(".w2.", ".down_proj.")
-            name = name.replace(".w3.", ".up_proj.")
-            name = "model." + name
-
-        return super().modify_tensors(data_torch, name, bid)
-

 class PixtralModel(LlavaVisionModel):
    model_name = "Pixtral"
@@ -10308,25 +10077,6 @@ class LazyTorchTensor(gguf.LazyBase):
        torch.uint8: np.uint8,
    }

-    # only used when byteswapping data. Only correct size is needed
-    _dtype_byteswap_map: dict[torch.dtype, type] = {
-        torch.float64: np.float64,
-        torch.float32: np.float32,
-        torch.bfloat16: np.float16,
-        torch.float16: np.float16,
-        torch.int64: np.int64,
-        torch.uint64: np.uint64,
-        torch.int32: np.int32,
-        torch.uint32: np.uint32,
-        torch.int16: np.int16,
-        torch.uint16: np.uint16,
-        torch.int8: np.int8,
-        torch.uint8: np.uint8,
-        torch.bool: np.uint8,
-        torch.float8_e4m3fn: np.uint8,
-        torch.float8_e5m2: np.uint8,
-    }
-
    # used for safetensors slices
    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
@@ -10370,14 +10120,8 @@ class LazyTorchTensor(gguf.LazyBase):
    @classmethod
    def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
        def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
-            def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
-                if sys.byteorder == 'big':
-                    # switch data back to big endian
-                    tensor = tensor.view(dtype).byteswap(inplace=False)
-                return tensor
            dtype = cls._dtype_str_map[tensor.dtype]
-            numpy_dtype = cls._dtype_byteswap_map[dtype]
-            return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape)
+            return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
        dtype = cls._dtype_str_map[t.dtype]
        shape = t.shape
        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
@@ -10385,16 +10129,10 @@ class LazyTorchTensor(gguf.LazyBase):

    @classmethod
    def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
-        def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
-            if sys.byteorder == 'big':
-                # switch data back to big endian
-                tensor = tensor.view(dtype).byteswap(inplace=False)
-            return tensor
        dtype = cls._dtype_str_map[remote_tensor.dtype]
-        numpy_dtype = cls._dtype_byteswap_map[dtype]
        shape = remote_tensor.shape
        meta = cls.meta_with_dtype_and_shape(dtype, shape)
-        lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape))
+        lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape))
        return cast(torch.Tensor, lazy)

    @classmethod
@@ -10625,8 +10363,6 @@ def main() -> None:
        elif args.mmproj:
            assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
            model_class = PixtralModel
-        elif "moe" in hparams:
-            model_class = MistralMoeModel
        else:
            model_class = MistralModel

--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
@@ -277,15 +277,10 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()


-def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
-    from huggingface_hub import try_to_load_from_cache
-
+def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
    # normally, adapter does not come with base model config, we need to load it from AutoConfig
    config = AutoConfig.from_pretrained(hf_model_id)
-    cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
-    cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
-
-    return config.to_dict(), cache_dir
+    return config.to_dict()


 if __name__ == '__main__':
@@ -330,13 +325,13 @@ if __name__ == '__main__':
    # load base model
    if base_model_id is not None:
        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
-        hparams, dir_base_model = load_hparams_from_hf(base_model_id)
+        hparams = load_hparams_from_hf(base_model_id)
    elif dir_base_model is None:
        if "base_model_name_or_path" in lparams:
            model_id = lparams["base_model_name_or_path"]
            logger.info(f"Loading base model from Hugging Face: {model_id}")
            try:
-                hparams, dir_base_model = load_hparams_from_hf(model_id)
+                hparams = load_hparams_from_hf(model_id)
            except OSError as e:
                logger.error(f"Failed to load base model config: {e}")
                logger.error("Please try downloading the base model and add its path to --base")
@@ -485,7 +480,6 @@ if __name__ == '__main__':
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
            hparams=hparams,
-            remote_hf_model_id=base_model_id,
        )

        logger.info("Exporting model...")
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -42,9 +42,6 @@ The following releases are verified and recommended:

 ## News

- 2025.11
-  - Support malloc memory on device more than 4GB.
-
 - 2025.2
  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
    |GPU|Base tokens/s|Increased tokens/s|Percent|
@@ -792,8 +789,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
-| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
-


 ## Known Issues
@@ -840,14 +835,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  | The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
  | The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|

- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
-
-  You need to enable to support 4GB memory malloc by:
-  ```
-    export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-    set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-  ```
-
 ### **GitHub contribution**:
 Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.

--- a/docs/backend/ZenDNN.md
+++ b/docs/backend/ZenDNN.md
@@ -1,258 +0,0 @@
-# llama.cpp for AMD ZenDNN
-
-> [!WARNING]
-> **Note:** ZenDNN is **not** the same as zDNN.
-> - **ZenDNN** (this page): AMD's deep learning library for AMD EPYC CPUs
-> - **zDNN**: IBM's Deep Neural Network acceleration library for IBM Z & LinuxONE Mainframes ([see zDNN documentation](zDNN.md))
-
- [Background](#background)
- [OS](#os)
- [Hardware](#hardware)
- [Supported Operations](#supported-operations)
- [DataType Supports](#datatype-supports)
- [Linux](#linux)
- [Environment Variable](#environment-variable)
- [Performance Optimization](#performance-optimization)
- [Known Issues](#known-issues)
- [TODO](#todo)
-
-## Background
-
-**ZenDNN** (Zen Deep Neural Network Library) is AMD's high-performance deep learning inference library optimized for AMD EPYC™ CPUs. It provides optimized implementations of key deep learning primitives and operations, delivering significant performance improvements for neural network workloads on AMD Zen-based processor architectures.
-
-**Llama.cpp + ZenDNN**
-
-The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN).
-
-For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html
-
-## OS
-
-| OS      | Status  | Verified                                       |
-|:-------:|:-------:|:----------------------------------------------:|
-| Linux   | Support | Ubuntu 20.04, 22.04, 24.04                     |
-
-For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os).
-
-## Hardware
-
-### AMD CPUs
-
-**Recommended Processors**
-
-ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based on "Zen" microarchitecture and newer.
-
-| CPU Family                    | Status  | Notes                              |
-|:-----------------------------:|:-------:|:----------------------------------:|
-| AMD EPYC™ 9005 Series (Turin)| Support | 5th Gen - Zen 5 architecture       |
-| AMD EPYC™ 9004 Series (Genoa)| Support | 4th Gen - Zen 4 architecture       |
-| AMD EPYC™ 7003 Series (Milan)| Support | 3rd Gen - Zen 3 architecture       |
-| AMD Ryzen™ AI MAX (Strix Halo)| Support | High-performance mobile processors |
-
-*Notes:*
-
- Best performance is achieved on AMD EPYC™ processors with high core counts (e.g., EPYC 9005 series).
- ZenDNN leverages AMD's advanced CPU features including AVX2 and AVX-512 instruction sets.
- For optimal performance, ensure your system has sufficient memory bandwidth.
-
-## Supported Operations
-
-The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.
-
-| Operation    | Status  | Notes                                          |
-|:-------------|:-------:|:----------------------------------------------:|
-| MUL_MAT      |    ✓    | Accelerated via ZenDNN LowOHA MatMul           |
-
-*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
-
-## DataType Supports
-
-| DataType               | Status  | Notes                                         |
-|:----------------------:|:-------:|:---------------------------------------------:|
-| FP32                   | Support | Full precision floating point                 |
-| BF16                   | Support | BFloat16 (best performance on Zen 4/Zen 5)    |
-
-*Notes:*
-
- **BF16** provides best performance on Zen 4 and Zen 5 EPYC™ processors (Genoa, Turin).
-
-## Linux
-
-### I. Setup Environment
-
-You have two options to set up ZenDNN:
-
-#### Option 1: Automatic Download and Build (Recommended)
-
-CMake will automatically download and build ZenDNN for you:
-
-```sh
-# Build llama.cpp - ZenDNN will be automatically downloaded and built
-cmake -B build -DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release
-cmake --build build --config Release -j $(nproc)
-```
-
-No manual ZenDNN installation required. CMake will handle everything automatically.
-
-#### Option 2: Use Custom ZenDNN Installation
-
-If you want to build ZenDNN yourself or use a specific version:
-
-**Step 1: Build ZenDNN from source**
-
-```sh
-# Clone ZenDNN repository
-git clone https://github.com/amd/ZenDNN.git
-cd ZenDNN
-git checkout zendnnl
-
-# Build and install (requires CMake >= 3.25)
-mkdir build && cd build
-cmake ..
-cmake --build . --target all
-```
-
-Default installation path: `ZenDNN/build/install`
-
-**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md).
-
-**Step 2: Build llama.cpp with custom ZenDNN path**
-
-```sh
-# Using environment variable
-export ZENDNN_ROOT=/path/to/ZenDNN/build/install
-cmake -B build -DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release
-cmake --build build --config Release -j $(nproc)
-
-# OR specify path directly in CMake
-cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/ZenDNN/build/install -DCMAKE_BUILD_TYPE=Release
-cmake --build build --config Release -j $(nproc)
-```
-
-### II. Run the Server
-
-#### 1. Download Model
-
-Download LLaMA 3.1 8B Instruct BF16 model:
-
-```sh
-# Download from Hugging Face
-huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF --local-dir models/
-```
-
-#### 2. Start Server
-
-Run llama.cpp server with ZenDNN acceleration:
-
-```sh
-# Set optimal configuration
-export OMP_NUM_THREADS=64  # Adjust to your CPU core count
-export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS for best performance
-
-# Start server
-./build/bin/llama-server \
-    -m models/Llama-3.1-8B-Instruct.BF16.gguf \
-    --host 0.0.0.0 \
-    --port 8080 \
-    -t 64
-```
-
-Access the server at `http://localhost:8080`.
-
-**Performance tips**:
- Set `OMP_NUM_THREADS` to match your physical core count
- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance
- For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...`
-
-## Environment Variable
-
-### Build Time
-
-| Name               | Value                                 | Function                                    |
-|--------------------|---------------------------------------|---------------------------------------------|
-| GGML_ZENDNN        | ON/OFF                                | Enable ZenDNN backend support               |
-| ZENDNN_ROOT        | Path to ZenDNN installation           | Set ZenDNN installation directory           |
-| GGML_OPENMP        | ON/OFF (recommended: ON)              | Enable OpenMP for multi-threading           |
-
-### Runtime
-
-| Name                    | Value                    | Function                                                          |
-|-------------------------|--------------------------|-------------------------------------------------------------------|
-| OMP_NUM_THREADS         | Number (e.g., 64)        | Set number of OpenMP threads (recommended: physical core count)   |
-| ZENDNNL_MATMUL_ALGO     | 0-5                      | Select MatMul backend algorithm (see Performance Optimization)    |
-| ZENDNNL_PROFILE_LOG_LEVEL | 0-4                    | Profiling log level (0=disabled, 4=verbose)                       |
-| ZENDNNL_ENABLE_PROFILER | 0 or 1                   | Enable detailed profiling (1=enabled)                             |
-| ZENDNNL_API_LOG_LEVEL   | 0-4                      | API log level (0=disabled, 4=verbose)                             |
-
-**Example**:
-
-```sh
-export OMP_NUM_THREADS=64
-export ZENDNNL_MATMUL_ALGO=2  # Use Blocked AOCL BLIS for best performance
-./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100
-```
-
-## Performance Optimization
-
-### MatMul Algorithm Selection
-
-ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm:
-
-```sh
-export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS (recommended)
-```
-
-**Available algorithms**:
-
-| Value | Algorithm              | Description                                    |
-|:-----:|:-----------------------|:----------------------------------------------|
-| 0     | Dynamic Dispatch       | Automatic backend selection (default)         |
-| 1     | AOCL BLIS              | AOCL BLIS backend                             |
-| 2     | AOCL BLIS Blocked      | **Blocked AOCL BLIS (recommended)**           |
-| 3     | OneDNN                 | OneDNN backend                                |
-| 4     | OneDNN Blocked         | Blocked OneDNN                                |
-| 5     | LibXSMM                | LibXSMM backend                               |
-
-### Profiling and Debugging
-
-For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md).
-
-## Known Issues
-
- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
-
-## Q&A
-
-**Q: How do I verify that ZenDNN backend is being used?**
-
-A: Check the log output when running llama.cpp. You should see messages indicating the ZenDNN backend is initialized. You can also check the backend name in the output.
-
-**Q: What performance improvement can I expect?**
-
-A: Performance gains vary depending on the model size, batch size, and CPU architecture. On AMD EPYC processors, you can typically expect 1.1x-2x speedup compared to standard CPU inference for matrix multiplication operations.
-
-**Q: Can I use ZenDNN on non-AMD processors?**
-
-A: ZenDNN is optimized specifically for AMD processors. While it may work on other x86-64 CPUs, performance benefits are only guaranteed on AMD Zen-based architectures.
-
-**Q: Does ZenDNN support quantized models?**
-
-A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized model support is not available at this time.
-
-**Q: Why is my inference not faster with ZenDNN?**
-
-A: Ensure:
-1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer)
-2. `OMP_NUM_THREADS` is set appropriately (physical core count)
-3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS)
-4. You're using a sufficiently large model (small models may not benefit as much)
-5. Enable profiling to verify ZenDNN MatMul is being called
-
-### **GitHub Contribution**:
-Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-team check/address them without delay.
-
-## TODO
-
- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
--- a/docs/backend/zDNN.md
+++ b/docs/backend/zDNN.md
@@ -1,10 +1,5 @@
 # llama.cpp for IBM zDNN Accelerator

-> [!WARNING]
-> **Note:** zDNN is **not** the same as ZenDNN.
-> - **zDNN** (this page): IBM's Deep Neural Network acceleration library for IBM Z & LinuxONE Mainframes
-> - **ZenDNN**: AMD's deep learning library for AMD EPYC CPUs ([see ZenDNN documentation](ZenDNN.md))
-
 ## Background

 IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -19,7 +19,6 @@ cmake -B build \
    -DGGML_RVV=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
-    -DGGML_RV_ZIHINTPAUSE=ON \
    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
    -DCMAKE_INSTALL_PREFIX=build/installed
--- a/docs/build.md
+++ b/docs/build.md
@@ -431,22 +431,11 @@ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/ren

 ### For Linux users:

-#### Using the LunarG Vulkan SDK
-
 First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.

 > [!IMPORTANT]
 > After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.

-#### Using system packages
-
-On Debian / Ubuntu, you can install the required dependencies using:
-```sh
-sudo apt-get install libvulkan-dev glslc
-```
-
-#### Common steps
-
 Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
 ```bash
 vulkaninfo
@@ -495,38 +484,6 @@ llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB

 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

-## ZenDNN
-
-ZenDNN provides optimized deep learning primitives for AMD EPYC™ CPUs. It accelerates matrix multiplication operations for inference workloads.
-
-### Compilation
-
- Using `CMake` on Linux (automatic build):
-
-    ```bash
-    cmake -B build -DGGML_ZENDNN=ON
-    cmake --build build --config Release
-    ```
-
-    The first build will automatically download and build ZenDNN, which may take 5-10 minutes. Subsequent builds will be much faster.
-
- Using `CMake` with custom ZenDNN installation:
-
-    ```bash
-    cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/zendnn/install
-    cmake --build build --config Release
-    ```
-
-### Testing
-
-You can test with:
-
-```bash
-./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -n 50
-```
-
-For detailed information about hardware support, setup instructions, and performance optimization, refer to [llama.cpp for ZenDNN](./backend/ZenDNN.md).
-
 ## Arm® KleidiAI™
 KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.

--- a/docs/development/parsing.md
+++ b/docs/development/parsing.md
@@ -1,288 +0,0 @@
-# Parsing Model Output
-
-The `common` library contains a PEG parser implementation suitable for parsing
-model output.
-
-Types with the prefix `common_peg_*` are intended for general use and may have
-applications beyond parsing model output, such as parsing user-provided regex
-patterns.
-
-Types with the prefix `common_chat_peg_*` are specialized helpers for model
-output.
-
-The parser features:
-
- Partial parsing of streaming input
- Built-in JSON parsers
- AST generation with semantics via "tagged" nodes
-
-## Example
-
-Below is a contrived example demonstrating how to use the PEG parser to parse
-output from a model that emits arguments as JSON.
-
-```cpp
-auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
-    // Build a choice of all available tools
-    auto tool_choice = p.choice();
-    for (const auto & tool : tools) {
-        const auto & function = tool.at("function");
-        std::string name = function.at("name");
-        const auto & schema = function.at("parameters");
-
-        auto tool_name = p.json_member("name", "\"" + p.literal(name) + "\"");
-        auto tool_args = p.json_member("arguments", p.schema(p.json(), "tool-" + name + "-schema", schema));
-
-        tool_choice |= p.rule("tool-" + name, "{" << tool_name << "," << tool_args << "}");
-    }
-
-    // Define the tool call structure: <tool_call>[{tool}]</tool_call>
-    auto tool_call = p.trigger_rule("tool-call",
-        p.sequence({
-            p.literal("<tool_call>["),
-            tool_choice,
-            p.literal("]</tool_call>")
-        })
-    );
-
-    // Parser accepts content, optionally followed by a tool call
-    return p.sequence({
-        p.content(p.until("<tool_call>")),
-        p.optional(tool_call),
-        p.end()
-    });
-});
-```
-
-For a more complete example, see `test_example_native()` in
-[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp).
-
-## Parsers/Combinators
-
-### Basic Matchers
-
- **`eps()`** - Matches nothing and always succeeds (epsilon/empty match)
- **`start()`** - Matches the start of input (anchor `^`)
- **`end()`** - Matches the end of input (anchor `$`)
- **`literal(string)`** - Matches an exact literal string
- **`any()`** - Matches any single character (`.`)
-
-### Combinators
-
- **`sequence(...)`** - Matches parsers in order; all must succeed
- **`choice(...)`** - Matches the first parser that succeeds from alternatives (ordered choice)
- **`one_or_more(p)`** - Matches one or more repetitions (`+`)
- **`zero_or_more(p)`** - Matches zero or more repetitions (`*`)
- **`optional(p)`** - Matches zero or one occurrence (`?`)
- **`repeat(p, min, max)`** - Matches between min and max repetitions (use `-1` for unbounded)
- **`repeat(p, n)`** - Matches exactly n repetitions
-
-### Lookahead
-
- **`peek(p)`** - Positive lookahead: succeeds if parser succeeds without consuming input (`&`)
- **`negate(p)`** - Negative lookahead: succeeds if parser fails without consuming input (`!`)
-
-### Character Classes & Utilities
-
- **`chars(classes, min, max)`** - Matches repetitions of characters from a character class
- **`space()`** - Matches zero or more whitespace characters (space, tab, newline)
- **`until(delimiter)`** - Matches characters until delimiter is found (delimiter not consumed)
- **`until_one_of(delimiters)`** - Matches characters until any delimiter in the list is found
- **`rest()`** - Matches everything remaining (`.*`)
-
-### JSON Parsers
-
- **`json()`** - Complete JSON parser (objects, arrays, strings, numbers, booleans, null)
- **`json_object()`** - JSON object parser
- **`json_array()`** - JSON array parser
- **`json_string()`** - JSON string parser
- **`json_number()`** - JSON number parser
- **`json_bool()`** - JSON boolean parser
- **`json_null()`** - JSON null parser
- **`json_string_content()`** - JSON string content without surrounding quotes
- **`json_member(key, p)`** - JSON object member with specific key and value parser
-
-### Grammar Building
-
- **`ref(name)`** - Creates a lightweight reference to a named rule (for recursive grammars)
- **`rule(name, p, trigger)`** - Creates a named rule and returns a reference
- **`trigger_rule(name, p)`** - Creates a trigger rule (entry point for lazy grammar generation)
- **`schema(p, name, schema, raw)`** - Wraps parser with JSON schema metadata for grammar generation
-
-### AST Control
-
- **`atomic(p)`** - Prevents AST node creation for partial parses
- **`tag(tag, p)`** - Creates AST nodes with semantic tags (multiple nodes can share tags)
-
-## GBNF Grammar Generation
-
-The PEG parser also acts as a convenient DSL for generating GBNF grammars, with
-some exceptions.
-
-```cpp
-data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-    foreach_function(params.tools, [&](const json & fn) {
-        builder.resolve_refs(fn.at("parameters"));
-    });
-    parser.build_grammar(builder, data.grammar_lazy);
-});
-```
-
-The notable exception is the `negate(p)` lookahead parser, which cannot be
-defined as a CFG grammar and therefore does not produce a rule. Its usage
-should be limited and preferably hidden behind a `schema()` parser. In many
-cases, `until(delimiter)` or `until_one_of(delimiters)` is a better choice.
-
-Another limitation is that the PEG parser requires an unambiguous grammar. In
-contrast, the `llama-grammar` implementation can support ambiguous grammars,
-though they are difficult to parse.
-
-### Lazy Grammars
-
-During lazy grammar generation, only rules reachable from a `trigger_rule(p)`
-are emitted in the grammar. All trigger rules are added as alternations in the
-root rule. It is still necessary to define trigger patterns, as the parser has
-no interaction with the grammar sampling.
-
-### JSON Schema
-
-The `schema(p, name, schema, raw)` parser will use the `json-schema-to-grammar`
-implementation to generate the grammar instead of the underlying parser.
-
-The `raw` option emits a grammar suitable for a raw string instead of a JSON
-string. In other words, it won't be wrapped in quotes or require escaping
-quotes. It should only be used when `type == "string"`.
-
-The downside is that it can potentially lead to ambiguous grammars. For
-example, if a user provides the pattern `^.*$`, the following grammar may be
-generated:
-
-```
-root ::= "<arg>" .* "</arg>"
-```
-
-This creates an ambiguous grammar that cannot be parsed by the PEG parser. To
-help mitigate this, if `.*` is found in the pattern, the grammar from the
-underlying parser will be emitted instead.
-
-## Common AST Shapes for Chat Parsing
-
-Most model output can be placed in one of the following categories:
-
- Content only
- Tool calling with arguments emitted as a single JSON object
- Tool calling with arguments emitted as separate entities, either XML
-  (Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)
-
-To provide broad coverage,
-[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and
-mappers that help create parsers and visitors/extractors for these types. They
-require parsers to tag nodes to conform to an AST "shape". This normalization
-makes it easy to extract information and generalize parsing.
-
-### Simple
-
-The `common_chat_peg_builder` builds a `simple` parser that supports
-content-only models with optional reasoning.
-
- **`reasoning(p)`** - Tag node for extracting `reasoning_content`
- **`content(p)`** - Tag node for extracting `content`
-
-```cpp
-build_chat_peg_parser([&](common_chat_peg_parser & p) {
-    return p.sequence({
-        p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>"),
-        p.content(p.until("<tool_call>")),
-        p.end()
-    });
-});
-```
-
-Use `common_chat_peg_mapper` to extract the content. Note that this is already
-done for you in `common_chat_peg_parser` when
-`chat_format == COMMON_CHAT_FORMAT_PEG_SIMPLE`.
-
-```cpp
-auto result = parser.parse(ctx);
-
-common_chat_msg msg;
-auto mapper = common_chat_peg_mapper(msg);
-mapper.from_ast(ctx.ast, result);
-```
-
-### Native
-
-The `common_chat_peg_native_builder` builds a `native` parser suitable for
-models that emit tool arguments as a direct JSON object.
-
- **`reasoning(p)`** - Tag node for `reasoning_content`
- **`content(p)`** - Tag node for `content`
- **`tool(p)`** - Tag entirety of a single tool call
- **`tool_open(p)`** - Tag start of a tool call
- **`tool_close(p)`** - Tag end of a tool call
- **`tool_id(p)`** - Tag the tool call ID (optional)
- **`tool_name(p)`** - Tag the tool name
- **`tool_args(p)`** - Tag the tool arguments
-
-```cpp
-build_chat_peg_native_parser([&](common_chat_peg_native_parser & p) {
-    auto get_weather_tool = p.tool(p.sequence({
-        p.tool_open(p.literal("{")),
-        p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
-        p.literal(","),
-        p.json_member("arguments", p.tool_args(p.json())),
-        p.tool_close(p.literal("}"))
-    }));
-
-    return p.sequence({
-        p.content(p.until("<tool_call>")),
-        p.literal("<tool_call>"),
-        get_weather_tool,
-        p.literal("</tool_call>"),
-        p.end()
-    });
-});
-```
-
-### Constructed
-
-The `common_chat_peg_constructed_builder` builds a `constructed` parser
-suitable for models that emit tool arguments as separate entities, such as XML
-tags.
-
- **`reasoning(p)`** - Tag node for `reasoning_content`
- **`content(p)`** - Tag node for `content`
- **`tool(p)`** - Tag entirety of a single tool call
- **`tool_open(p)`** - Tag start of a tool call
- **`tool_close(p)`** - Tag end of a tool call
- **`tool_name(p)`** - Tag the tool name
- **`tool_arg(p)`** - Tag a complete tool argument (name + value)
- **`tool_arg_open(p)`** - Tag start of a tool argument
- **`tool_arg_close(p)`** - Tag end of a tool argument
- **`tool_arg_name(p)`** - Tag the argument name
- **`tool_arg_string_value(p)`** - Tag string value for the argument
- **`tool_arg_json_value(p)`** - Tag JSON value for the argument
-
-```cpp
-build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
-    auto location_arg = p.tool_arg(
-        p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
-        p.tool_arg_string_value(p.until("</parameter>")),
-        p.tool_arg_close(p.literal("</parameter>"))
-    );
-
-    auto get_weather_tool = p.tool(p.sequence({
-        p.tool_open("<function name=\"" + p.tool_name(p.literal("get_weather")) + "\">"),
-        location_arg,
-        p.tool_close(p.literal("</function>"))
-    }));
-
-    return p.sequence({
-        p.content(p.until("<tool_call>")),
-        p.literal("<tool_call>"),
-        get_weather_tool,
-        p.literal("</tool_call>"),
-        p.end()
-    });
-});
-```
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
 ```

 ## Docker With CUDA
@@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ## Docker With MUSA
@@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
 ```bash
 docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -12,112 +12,110 @@ Legend:
 - 🟡 Partially supported by this backend
 - ❌ Not supported by this backend

-| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
-|-----------|------|------|------|------|------|------|------|------|------|------|------|
-|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
-|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
-|                              PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                              SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
-|                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
-|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                              SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ |
-|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
+|-----------|------|------|------|------|------|------|------|------|------|
+|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
+|                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ |
+|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+|                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
+|                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
+|                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ❌ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ |
+|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+|                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+|                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
+|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
+|                              PAD | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
+|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
+|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
+|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
+|                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                              SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ❌ | ❌ |
+|                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+|                              SUM | ❌ | ✅ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
+|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 🟡 | ❌ |
+|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
+|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
--- a/docs/ops/BLAS.csv
+++ b/docs/ops/BLAS.csv
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
@@ -4964,7 +4964,6 @@
 "CPU","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CPU"
-"CPU","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CPU"
 "CPU","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CPU"
@@ -5420,45 +5419,17 @@
 "CPU","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
-"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
-"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CPU"
 "CPU","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
@@ -5684,7 +5655,6 @@
 "CPU","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
-"CPU","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CPU"
@@ -8674,13 +8644,9 @@
 "CPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CPU"
 "CPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
@@ -8700,13 +8666,9 @@
 "CPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CPU"
@@ -9449,405 +9411,18 @@
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CPU"
@@ -9860,10 +9435,6 @@
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
@@ -9892,30 +9463,15 @@
 "CPU","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CPU"
 "CPU","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CPU"
-"CPU","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CPU"
 "CPU","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CPU"
 "CPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CPU"
@@ -9924,10 +9480,6 @@
 "CPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CPU"
-"CPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CPU"
-"CPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CPU"
-"CPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CPU"
-"CPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CPU"
@@ -9935,16 +9487,10 @@
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","1","yes","CPU"
-"CPU","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","1","yes","CPU"
-"CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","CPU"
--- a/docs/ops/CUDA.csv
+++ b/docs/ops/CUDA.csv
@@ -4964,7 +4964,6 @@
 "CUDA0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CUDA"
-"CUDA0","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CUDA"
 "CUDA0","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CUDA"
@@ -5420,45 +5419,17 @@
 "CUDA0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
-"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
-"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CUDA"
 "CUDA0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
@@ -5684,7 +5655,6 @@
 "CUDA0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
-"CUDA0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CUDA"
@@ -8674,13 +8644,9 @@
 "CUDA0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
 "CUDA0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CUDA"
 "CUDA0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
@@ -8700,13 +8666,9 @@
 "CUDA0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CUDA"
@@ -9449,405 +9411,18 @@
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","CUDA"
 "CUDA0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CUDA"
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CUDA"
@@ -9860,10 +9435,6 @@
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
@@ -9892,59 +9463,34 @@
 "CUDA0","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CUDA"
 "CUDA0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CUDA"
-"CUDA0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CUDA"
 "CUDA0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
 "CUDA0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CUDA"
-"CUDA0","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CUDA"
-"CUDA0","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CUDA"
-"CUDA0","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","CUDA"
 "CUDA0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","CUDA"
--- a/docs/ops/Metal.csv
+++ b/docs/ops/Metal.csv
--- a/docs/ops/OpenCL.csv
+++ b/docs/ops/OpenCL.csv
--- a/docs/ops/Vulkan.csv
+++ b/docs/ops/Vulkan.csv
@@ -5,8 +5,8 @@
 "Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","STEP","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","TANH","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","TANH","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","ELU","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
@@ -29,18 +29,18 @@
 "Vulkan0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
 "Vulkan0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
-"Vulkan0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
 "Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
@@ -89,8 +89,8 @@
 "Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","STEP","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","TANH","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","TANH","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","ELU","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
@@ -113,18 +113,18 @@
 "Vulkan0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
 "Vulkan0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
-"Vulkan0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
 "Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
@@ -5005,8 +5005,8 @@
 "Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
 "Vulkan0","DUP","type=f32,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
 "Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
-"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
-"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","1","yes","Vulkan"
+"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","0","no","Vulkan"
+"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","0","no","Vulkan"
 "Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=1","support","0","no","Vulkan"
 "Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=2","support","0","no","Vulkan"
 "Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=3","support","0","no","Vulkan"
@@ -5032,14 +5032,14 @@
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
@@ -5271,7 +5271,7 @@
 "Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
@@ -5415,49 +5415,21 @@
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
-"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
+"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
 "Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
-"Vulkan0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","Vulkan"
+"Vulkan0","CONT","type=bf16,ne=[2,3,5,7]","support","0","no","Vulkan"
 "Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
@@ -5682,8 +5654,7 @@
 "Vulkan0","SUB","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
-"Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
+"Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","Vulkan"
@@ -8661,10 +8632,10 @@
 "Vulkan0","COS","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
 "Vulkan0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
 "Vulkan0","SQR","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","LOG","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
@@ -8672,14 +8643,10 @@
 "Vulkan0","COS","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","0","no","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
 "Vulkan0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","Vulkan"
 "Vulkan0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
@@ -8687,10 +8654,10 @@
 "Vulkan0","COS","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
 "Vulkan0","CLAMP","type=f32,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
 "Vulkan0","SQR","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
 "Vulkan0","SQRT","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
 "Vulkan0","LOG","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
@@ -8698,14 +8665,10 @@
 "Vulkan0","COS","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
 "Vulkan0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","Vulkan"
@@ -9448,405 +9411,28 @@
 "Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","Vulkan"
 "Vulkan0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
 "Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","0","no","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Vulkan"
-"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","Vulkan"
+"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","Vulkan"
 "Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","1","yes","Vulkan"
-"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","Vulkan"
@@ -9859,10 +9445,6 @@
 "Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","Vulkan"
-"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","Vulkan"
-"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","Vulkan"
-"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","Vulkan"
-"Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
 "Vulkan0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
@@ -9896,38 +9478,24 @@
 "Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","Vulkan"
 "Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","Vulkan"
 "Vulkan0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","Vulkan"
-"Vulkan0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","Vulkan"
-"Vulkan0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","Vulkan"
+"Vulkan0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","0","no","Vulkan"
 "Vulkan0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","Vulkan"
-"Vulkan0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","Vulkan"
+"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
 "Vulkan0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","Vulkan"
-"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","Vulkan"
-"Vulkan0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","Vulkan"
-"Vulkan0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Vulkan"
-"Vulkan0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Vulkan"
-"Vulkan0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","Vulkan"
-"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","Vulkan"
+"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","Vulkan"
+"Vulkan0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","Vulkan"
+"Vulkan0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","Vulkan"
+"Vulkan0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","Vulkan"
+"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","Vulkan"
 "Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","Vulkan"
 "Vulkan0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","Vulkan"
 "Vulkan0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","Vulkan"
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
--- a/docs/ops/ZenDNN.csv
+++ b/docs/ops/ZenDNN.csv
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,6 @@ else()

    add_subdirectory(gguf-hash)
    add_subdirectory(gguf)
-    add_subdirectory(idle)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(parallel)
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4

 ...

--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -6,54 +6,8 @@ More Info:
 - https://github.com/ggml-org/llama.cpp/pull/14644
 - https://github.com/ggml-org/llama.cpp/pull/14771

-## Parameters
-The diffusion CLI supports various parameters to control the generation process:

-### Core Diffusion Parameters
- `--diffusion-steps`: Number of diffusion steps (default: 256)
- `--diffusion-algorithm`: Algorithm for token selection
-  - `0`: ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
-  - `1`: ENTROPY_BASED - Entropy-based selection
-  - `2`: MARGIN_BASED - Margin-based selection
-  - `3`: RANDOM - Random selection
-  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
-  - More documentation here https://github.com/DreamLM/Dream
- `--diffusion-visual`: Enable live visualization during generation
+Example of using Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`

-### Scheduling Parameters
-Choose one of the following scheduling methods:
+Example of using LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`

-**Timestep-based scheduling:**
- `--diffusion-eps`: Epsilon value for timestep scheduling (e.g., 0.001)
-
-**Block-based scheduling:**
- `--diffusion-block-length`: Block size for block-based scheduling (e.g., 32)
-
-### Sampling Parameters
- `--temp`: Temperature for sampling (0.0 = greedy/deterministic, higher = more random)
- `--top-k`: Top-k filtering for sampling
- `--top-p`: Top-p (nucleus) filtering for sampling
- `--seed`: Random seed for reproducibility
-
-### Model Parameters
- `-m`: Path to the GGUF model file
- `-p`: Input prompt text
- `-ub`: Maximum sequence length (ubatch size)
- `-c`: Context size
- `-b`: Batch size
-
-### Examples
-#### Dream architechture:
-```
-llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
-```
-
-#### LLaDA architechture:
-```
-llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
-```
-
-#### RND1 architecture:
-```
-llama-diffusion-cli -m RND1-Base-0910.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-algorithm 1 --diffusion-steps 256 --diffusion-visual --temp 0.5 --diffusion-eps 0.001
-```
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -4,10 +4,10 @@
 #include "llama.h"
 #include "ggml.h"

-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <numeric>

 /**
 * This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    return u.f;
 }

-static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
    float v;
    if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
+        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
    } else if (type == GGML_TYPE_F32) {
-        v = *(const float *) &data[i];
+        v = *(float *) &data[i];
    } else if (type == GGML_TYPE_I64) {
-        v = (float) *(const int64_t *) &data[i];
+        v = (float) *(int64_t *) &data[i];
    } else if (type == GGML_TYPE_I32) {
-        v = (float) *(const int32_t *) &data[i];
+        v = (float) *(int32_t *) &data[i];
    } else if (type == GGML_TYPE_I16) {
-        v = (float) *(const int16_t *) &data[i];
+        v = (float) *(int16_t *) &data[i];
    } else if (type == GGML_TYPE_I8) {
-        v = (float) *(const int8_t *) &data[i];
+        v = (float) *(int8_t *) &data[i];
    } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
+        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
    } else {
        GGML_ABORT("fatal error");
    }
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -14,13 +14,12 @@ static void write_table_header(std::ofstream & file) {
 static void write_table_entry(std::ofstream & file, const common_arg & opt) {
    file << "| `";
    // args
-    auto all_args = opt.get_args();
-    for (const auto & arg : all_args) {
-    if (arg == all_args.front()) {
+    for (const auto & arg : opt.args) {
+    if (arg == opt.args.front()) {
            file << arg;
-            if (all_args.size() > 1) file << ", ";
+            if (opt.args.size() > 1) file << ", ";
        } else {
-            file << arg << (arg != all_args.back() ? ", " : "");
+            file << arg << (arg != opt.args.back() ? ", " : "");
        }
    }
    // value hint
@@ -77,7 +76,7 @@ static void export_md(std::string fname, llama_example ex) {
 }

 int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
+    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);

    return 0;
--- a/examples/idle/CMakeLists.txt
+++ b/examples/idle/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-idle)
-add_executable(${TARGET} idle.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/idle/README.md
+++ b/examples/idle/README.md
@@ -1,3 +0,0 @@
-# llama.cpp/example/idle
-
-https://github.com/ggml-org/llama.cpp/pull/17766
--- a/examples/idle/idle.cpp
+++ b/examples/idle/idle.cpp
@@ -1,110 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "log.h"
-#include "llama.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <thread>
-#include <vector>
-
-static void print_usage(int /*argc*/, char ** argv) {
-    printf("\nexample usage:\n");
-    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
-    printf("\n");
-}
-
-int main(int argc, char ** argv) {
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
-        return 1;
-    }
-
-    common_init();
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    // initialize the model
-
-    llama_model_params model_params = common_model_params_to_llama(params);
-
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
-
-    if (model == NULL) {
-        LOG_ERR("%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    // we need just a dummy token to evaluate
-    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
-
-    llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx   = 512;
-    ctx_params.n_batch = 512;
-    ctx_params.no_perf = false;
-
-    llama_context * ctx = llama_init_from_model(model, ctx_params);
-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
-
-    const int n_iters = 3;
-
-    // warm-up
-    llama_decode(ctx, batch);
-    llama_memory_clear(llama_get_memory(ctx), true);
-    llama_synchronize(ctx);
-
-    for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
-        double t_sum_us  = 0.0;
-        double t_sum2_us = 0.0;
-
-        for (int i = 0; i < n_iters; i++) {
-            // this pause is important - it simulates "idle GPU"
-            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
-
-            const int64_t t_start_us = llama_time_us();
-
-            // this should take constant time
-            llama_decode(ctx, batch);
-            llama_synchronize(ctx);
-
-            const int64_t t_end_us = llama_time_us();
-
-            const double t_cur_us = t_end_us - t_start_us;
-
-#if 1
-            // print individual decode times
-            printf("  - decode time: %8.2f ms\n", t_cur_us / 1000);
-#endif
-
-            t_sum_us  += t_cur_us;
-            t_sum2_us += t_cur_us * t_cur_us;
-
-            llama_memory_clear(llama_get_memory(ctx), true);
-            llama_synchronize(ctx); // just in case
-        }
-
-        const double t_avg_us = t_sum_us / n_iters;
-        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
-
-        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
-        fflush(stdout);
-    }
-
-    llama_free(ctx);
-    llama_model_free(model);
-
-    return 0;
-}
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -231,9 +231,9 @@ DOT = '[^\\x0A\\x0D]'
 RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])

 INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
-GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]')
+GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
 GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
-GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]', '\\': '\\\\'}
+GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}

 NON_LITERAL_SET = set('|.()[]{}*+?')
 ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
--- a/examples/model-conversion/logits.cpp
+++ b/examples/model-conversion/logits.cpp
@@ -144,7 +144,7 @@ int main(int argc, char ** argv) {
            return 1;
        }
        std::string s(buf, n);
-        printf("%s (%d)", s.c_str(), id);
+        printf("%s", s.c_str());
    }
    printf("\n");

--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@@ -32,6 +32,10 @@ def quick_logits_check(pytorch_file, llamacpp_file):
    print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
    print(f"Max absolute difference: {max_diff:.4f}")

+    if max_diff > 1.0:
+        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
+        return False
+
    return True

 def main():
--- a/examples/model-conversion/scripts/causal/run-converted-model.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model.sh
@@ -4,11 +4,6 @@ set -e

 # First try command line argument, then environment variable, then file
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
-MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
-
-if [ -z "$MODEL_TESTING_PROMPT"]; then
-    MODEL_TESTING_PROMPT="Hello, my name is"
-fi

 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
@@ -19,8 +14,7 @@ if [ -z "$CONVERTED_MODEL" ]; then
 fi

 echo $CONVERTED_MODEL
-echo $MODEL_TESTING_PROMPT

 cmake --build ../../build --target llama-logits -j8

-../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"
--- a/examples/model-conversion/scripts/causal/run-org-model.py
+++ b/examples/model-conversion/scripts/causal/run-org-model.py
@@ -184,12 +184,8 @@ model_name = os.path.basename(model_path)
 # of using AutoModelForCausalLM.
 print(f"Model class: {model.__class__.__name__}")

-device = next(model.parameters()).device
-if os.getenv("MODEL_TESTING_PROMPT"):
-    prompt = os.getenv("MODEL_TESTING_PROMPT")
-else:
-    prompt = "Hello, my name is"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+prompt = "Hello, my name is"
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids

 print(f"Input tokens: {input_ids}")
 print(f"Input text: {repr(prompt)}")
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -241,12 +241,6 @@ int main(int argc, char ** argv) {

    llama_batch_free(batch);

-    // this one is managed by common_init_result
-    //llama_free(ctx);
-
-    llama_free(ctx2);
-    llama_free(ctx3);
-
    if (result0 != result2) {
        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
        return 1;
--- a/examples/simple-cmake-pkg/README.md
+++ b/examples/simple-cmake-pkg/README.md
@@ -18,7 +18,6 @@ cd llama.cpp
 cmake -S . -B build
 cmake --build build
 cmake --install build --prefix inst
-```

 ### Build simple-cmake-pkg

--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -15,9 +15,6 @@ MODEL_FILE=models/llama-2-7b.Q4_0.gguf
 NGL=99
 CONTEXT=4096

-#support malloc device memory more than 4GB.
-export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
    echo "use $GGML_SYCL_DEVICE as main GPU"
--- a/examples/sycl/run-llama3.sh
+++ b/examples/sycl/run-llama3.sh
@@ -6,7 +6,7 @@

 # If you want more control, DPC++ Allows selecting a specific device through the
 # following environment variable
-export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
 source /opt/intel/oneapi/setvars.sh

 #export GGML_SYCL_DEBUG=1
@@ -18,14 +18,11 @@ MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
 NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
 CONTEXT=4096

-#support malloc device memory more than 4GB.
-export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
 if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
    echo "Using $GGML_SYCL_DEVICE as the main GPU"
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
 fi
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -5,7 +5,5 @@
 set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-:: support malloc device memory more than 4GB.
-set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1

 .\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
--- a/examples/sycl/win-run-llama3.bat
+++ b/examples/sycl/win-run-llama3.bat
@@ -5,7 +5,5 @@
 set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-:: support malloc device memory more than 4GB.
-set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1

-.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -25,17 +25,16 @@ if(GIT_EXE)
    )
 endif()

+# Build the version string with optional dirty flag
 set(GGML_VERSION "${GGML_VERSION_BASE}")
+if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
+    set(GGML_VERSION "${GGML_VERSION}-dirty")
+endif()

 if(NOT GGML_BUILD_COMMIT)
    set(GGML_BUILD_COMMIT "unknown")
 endif()

-# Build the commit string with optional dirty flag
-if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
-    set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
-endif()
-
 include(CheckIncludeFileCXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -168,7 +167,6 @@ option(GGML_RVV              "ggml: enable rvv"              ON)
 option(GGML_RV_ZFH           "ggml: enable riscv zfh"        ON)
 option(GGML_RV_ZVFH          "ggml: enable riscv zvfh"       ON)
 option(GGML_RV_ZICBOP        "ggml: enable riscv zicbop"     ON)
-option(GGML_RV_ZIHINTPAUSE   "ggml: enable riscv zihintpause "  ON)
 option(GGML_XTHEADVECTOR     "ggml: enable xtheadvector"     OFF)
 option(GGML_VXE              "ggml: enable vxe"              ${GGML_NATIVE})

@@ -176,6 +174,11 @@ option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requi
 set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")

+
+if (MINGW)
+    set(GGML_WIN_VER "0xA00" CACHE STRING   "ggml: Windows version")
+endif()
+
 # ggml core
 set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
 option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
@@ -222,7 +225,7 @@ option(GGML_WEBGPU                          "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
 option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)"             OFF)
 option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
-option(GGML_WEBGPU_JSPI                     "ggml: use JSPI for WebGPU"                       ON)
+
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
@@ -254,9 +257,6 @@ option(GGML_HEXAGON                         "ggml: enable Hexagon backend"
 # toolchain for vulkan-shaders-gen
 set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

-option(GGML_ZENDNN                          "ggml: use ZenDNN"                                OFF)
-option(ZENDNN_ROOT                          "ggml: path to ZenDNN installation"               "")
-
 # extra artifacts
 option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
@@ -318,7 +318,6 @@ set(GGML_PUBLIC_HEADERS
    include/ggml-sycl.h
    include/ggml-vulkan.h
    include/ggml-webgpu.h
-    include/ggml-zendnn.h
    include/gguf.h)

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -408,67 +407,62 @@ if (MSVC)
        /wd4996  # Disable POSIX deprecation warnings
        /wd4702  # Unreachable code warnings
    )
-    set(MSVC_COMPILE_OPTIONS
-        "$<$<COMPILE_LANGUAGE:C>:/utf-8>"
-        "$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
-    )
-    function(configure_msvc_target target_name)
+    function(disable_msvc_warnings target_name)
        if(TARGET ${target_name})
            target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
-            target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
        endif()
    endfunction()

-    configure_msvc_target(ggml-base)
-    configure_msvc_target(ggml)
-    configure_msvc_target(ggml-cpu)
-    configure_msvc_target(ggml-cpu-x64)
-    configure_msvc_target(ggml-cpu-sse42)
-    configure_msvc_target(ggml-cpu-sandybridge)
-    configure_msvc_target(ggml-cpu-haswell)
-    configure_msvc_target(ggml-cpu-skylakex)
-    configure_msvc_target(ggml-cpu-icelake)
-    configure_msvc_target(ggml-cpu-alderlake)
+    disable_msvc_warnings(ggml-base)
+    disable_msvc_warnings(ggml)
+    disable_msvc_warnings(ggml-cpu)
+    disable_msvc_warnings(ggml-cpu-x64)
+    disable_msvc_warnings(ggml-cpu-sse42)
+    disable_msvc_warnings(ggml-cpu-sandybridge)
+    disable_msvc_warnings(ggml-cpu-haswell)
+    disable_msvc_warnings(ggml-cpu-skylakex)
+    disable_msvc_warnings(ggml-cpu-icelake)
+    disable_msvc_warnings(ggml-cpu-alderlake)

    if (GGML_BUILD_EXAMPLES)
-        configure_msvc_target(common-ggml)
-        configure_msvc_target(common)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)

-        configure_msvc_target(mnist-common)
-        configure_msvc_target(mnist-eval)
-        configure_msvc_target(mnist-train)
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)

-        configure_msvc_target(gpt-2-ctx)
-        configure_msvc_target(gpt-2-alloc)
-        configure_msvc_target(gpt-2-backend)
-        configure_msvc_target(gpt-2-sched)
-        configure_msvc_target(gpt-2-quantize)
-        configure_msvc_target(gpt-2-batched)
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)

-        configure_msvc_target(gpt-j)
-        configure_msvc_target(gpt-j-quantize)
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)

-        configure_msvc_target(magika)
-        configure_msvc_target(yolov3-tiny)
-        configure_msvc_target(sam)
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)

-        configure_msvc_target(simple-ctx)
-        configure_msvc_target(simple-backend)
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
    endif()

    if (GGML_BUILD_TESTS)
-        configure_msvc_target(test-mul-mat)
-        configure_msvc_target(test-arange)
-        configure_msvc_target(test-backend-ops)
-        configure_msvc_target(test-cont)
-        configure_msvc_target(test-conv-transpose)
-        configure_msvc_target(test-conv-transpose-1d)
-        configure_msvc_target(test-conv1d)
-        configure_msvc_target(test-conv2d)
-        configure_msvc_target(test-conv2d-dw)
-        configure_msvc_target(test-customop)
-        configure_msvc_target(test-dup)
-        configure_msvc_target(test-opt)
-        configure_msvc_target(test-pool)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
    endif ()
 endif()
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -99,7 +99,6 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_sme        (void);
    // other
    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
-    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void);  // risc-v vector length in bytes
    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
    GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -1,5 +1,6 @@
 #pragma once

+#include "ggml.h"
 #include "ggml-backend.h"

 #ifdef  __cplusplus
@@ -7,7 +8,7 @@ extern "C" {
 #endif

 #define RPC_PROTO_MAJOR_VERSION    3
-#define RPC_PROTO_MINOR_VERSION    6
+#define RPC_PROTO_MINOR_VERSION    0
 #define RPC_PROTO_PATCH_VERSION    0
 #define GGML_RPC_MAX_SERVERS       16

--- a/ggml/include/ggml-zendnn.h
+++ b/ggml/include/ggml-zendnn.h
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "ggml-backend.h"
-#include "ggml.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void);
-
-GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend);
-
-// number of threads used for zendnn operations
-GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -204,10 +204,6 @@
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#if defined(_WIN32) && !defined(_WIN32_WINNT)
-#    define _WIN32_WINNT 0x0A00
-#endif
-
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -534,7 +530,6 @@ extern "C" {
        GGML_OP_ARANGE,
        GGML_OP_TIMESTEP_EMBEDDING,
        GGML_OP_ARGSORT,
-        GGML_OP_TOP_K,
        GGML_OP_LEAKY_RELU,
        GGML_OP_TRI,
        GGML_OP_FILL,
@@ -2152,8 +2147,7 @@ extern "C" {
    };

    enum ggml_scale_flag {
-        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
-        GGML_SCALE_FLAG_ANTIALIAS     = (1 << 9),
+        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
    };

    // interpolate
@@ -2196,15 +2190,6 @@ extern "C" {
            int                  p2,
            int                  p3);

-    // pad each dimension with values on the other side of the torus (looping around)
-    GGML_API struct ggml_tensor * ggml_pad_circular(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   p0,
-            int                   p1,
-            int                   p2,
-            int                   p3);
-
    GGML_API struct ggml_tensor * ggml_pad_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -2218,19 +2203,6 @@ extern "C" {
            int                  rp3
            );

-    // pad each dimension with values on the other side of the torus (looping around)
-    GGML_API struct ggml_tensor * ggml_pad_ext_circular(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   lp0,
-            int                   rp0,
-            int                   lp1,
-            int                   rp1,
-            int                   lp2,
-            int                   rp2,
-            int                   lp3,
-            int                   rp3);
-
    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
    GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
            struct ggml_context * ctx,
@@ -2286,30 +2258,25 @@ extern "C" {
            struct ggml_tensor  * a,
            enum ggml_sort_order  order);

-    // similar to ggml_top_k but implemented as `argsort` + `view`
-    GGML_API struct ggml_tensor * ggml_argsort_top_k(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   k);
-
-    // top k elements per row
-    // note: the resulting top k indices are in no particular order
-    GGML_API struct ggml_tensor * ggml_top_k(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   k);
-
    GGML_API struct ggml_tensor * ggml_arange(
            struct ggml_context * ctx,
            float                 start,
            float                 stop,
            float                 step);

-    // q:    [n_embd_k, n_batch, n_head,    ne3 ]
-    // k:    [n_embd_k, n_kv,    n_head_kv, ne3 ]
-    // v:    [n_embd_v, n_kv,    n_head_kv, ne3 ] !! not transposed !!
-    // mask: [n_kv,     n_batch, ne32,      ne33]
-    // res:  [n_embd_v, n_head,  n_batch,   ne3 ] !! permuted !!
+    // top k elements per row
+    GGML_API struct ggml_tensor * ggml_top_k(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   k);
+
+#define GGML_KQ_MASK_PAD 64
+
+    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
    //
    // broadcast:
    //   n_head % n_head_kv == 0
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -127,6 +127,10 @@ if (NOT MSVC)
    endif()
 endif()

+if (MINGW)
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
 #
 # POSIX conformance
 #
@@ -270,13 +274,10 @@ function(ggml_add_backend_library backend)
    endif()

    # Set versioning properties for all backend libraries
-    # Building a MODULE library with a version is not supported on macOS (https://gitlab.kitware.com/cmake/cmake/-/issues/20782)
-    if (NOT (APPLE AND GGML_BACKEND_DL))
-        set_target_properties(${backend} PROPERTIES
-            VERSION ${GGML_VERSION}
-            SOVERSION ${GGML_VERSION_MAJOR}
-        )
-    endif()
+    set_target_properties(${backend} PROPERTIES
+        VERSION ${GGML_VERSION}
+        SOVERSION ${GGML_VERSION_MAJOR}
+    )

    if(NOT GGML_AVAILABLE_BACKENDS)
        set(GGML_AVAILABLE_BACKENDS "${backend}"
@@ -331,14 +332,6 @@ function(ggml_add_cpu_backend_variant tag_name)
            set(GGML_INTERNAL_${feat} OFF)
        endforeach()

-        foreach (feat ${ARGN})
-            set(GGML_INTERNAL_${feat} ON)
-        endforeach()
-    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
-        foreach (feat RVV)
-            set(GGML_INTERNAL_${feat} OFF)
-        endforeach()
-
        foreach (feat ${ARGN})
            set(GGML_INTERNAL_${feat} ON)
        endforeach()
@@ -413,13 +406,6 @@ if (GGML_CPU_ALL_VARIANTS)
        else()
            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
        endif()
-    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
-        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-            ggml_add_cpu_backend_variant(riscv64_0)
-            ggml_add_cpu_backend_variant(riscv64_v   RVV)
-        else()
-            message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
-        endif()
    else()
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
    endif()
@@ -440,7 +426,6 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 ggml_add_backend(Hexagon)
-ggml_add_backend(ZenDNN)

 foreach (target ggml-base ggml)
    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -25,7 +25,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 // ops that return true for this function must not use restrict pointers for their backend implementations
 bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
-        case GGML_OP_FILL:
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
@@ -312,9 +311,16 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, addr, tensor);
+#endif
+
    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];

    // see if we can merge with an existing block
@@ -350,6 +356,8 @@ static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct
    }
    // otherwise, add a new block
    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
+
+    GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
@@ -607,17 +615,13 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten

    GGML_ASSERT(parent_size >= node_size);

-    // note: we want after the freeing the chunks to continue to be aligned
-    struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
-    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
-    node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
-
    if (parent_size > node_size) {
+        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
        struct buffer_address p_addr = p_hn->addr;
        p_addr.offset += node_size;
        size_t extra_size = parent_size - node_size;
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-        ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
+        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
    }
 }

@@ -701,14 +705,7 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-
-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
-#ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, hn->addr, node);
-#endif
-
-    ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
+    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
    hn->allocated = false;
 }

--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -73,10 +73,6 @@
 #include "ggml-cann.h"
 #endif

-#ifdef GGML_USE_ZENDNN
-#include "ggml-zendnn.h"
-#endif
-
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -207,9 +203,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_OPENCL
        register_backend(ggml_backend_opencl_reg());
 #endif
-#ifdef GGML_USE_ZENDNN
-        register_backend(ggml_backend_zendnn_reg());
-#endif
 #ifdef GGML_USE_HEXAGON
        register_backend(ggml_backend_hexagon_reg());
 #endif
@@ -541,12 +534,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
    fs::path best_path;

    for (const auto & search_path : search_paths) {
-        if (std::error_code ec; !fs::exists(search_path, ec)) {
-            if (ec) {
-                GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
-            } else {
-                GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
-            }
+        if (!fs::exists(search_path)) {
+            GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
            continue;
        }
        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
@@ -586,12 +575,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        for (const auto & search_path : search_paths) {
            fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
            fs::path path = search_path / filename;
-            if (std::error_code ec; fs::exists(path, ec)) {
+            if (fs::exists(path)) {
                return get_reg().load_backend(path, silent);
-            } else {
-                if (ec) {
-                    GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(path).c_str(), ec.message().c_str());
-                }
            }
        }
        return nullptr;
@@ -612,7 +597,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
 #endif

    ggml_backend_load_best("blas", silent, dir_path);
-    ggml_backend_load_best("zendnn", silent, dir_path);
    ggml_backend_load_best("cann", silent, dir_path);
    ggml_backend_load_best("cuda", silent, dir_path);
    ggml_backend_load_best("hip", silent, dir_path);
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -723,12 +723,6 @@ struct ggml_backend_sched {
    bool op_offload;

    int debug;
-
-    // used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
-    // ref: https://github.com/ggml-org/llama.cpp/pull/17617
-    int debug_realloc;
-    int debug_graph_size;
-    int debug_prev_graph_size;
 };

 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1240,8 +1234,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                            }
-                            ggml_set_input(tensor_copy);
-                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
                            tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
@@ -1293,11 +1289,6 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
    }

    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
-
-    // remember the actual graph_size for performing reallocation checks later [GGML_SCHED_DEBUG_REALLOC]
-    sched->debug_prev_graph_size = sched->debug_graph_size;
-    sched->debug_graph_size = graph_size;
-
    if (sched->graph.size < graph_size) {
        sched->graph.size = graph_size;
        sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
@@ -1404,26 +1395,18 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {

    // allocate graph
    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
-#endif
-
-        if (sched->debug_realloc > 0) {
-            // we are interested only in situations where the graph was reallocated even though its size remained the same [GGML_SCHED_DEBUG_REALLOC]
-            // example: https://github.com/ggml-org/llama.cpp/pull/17143
-            const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
-
-            if (unexpected || sched->debug_realloc > 1) {
-                GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
-                        sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
-            }
-        }
-
        // the re-allocation may cause the split inputs to be moved to a different address
        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
        for (int i = 0; i < sched->n_backends; i++) {
            ggml_backend_synchronize(sched->backends[i]);
        }
+#ifdef GGML_SCHED_NO_REALLOC // build-time option: treat any graph re-allocation as a fatal error
+        GGML_ABORT("%s: failed to allocate graph, but graph re-allocation is disabled by GGML_SCHED_NO_REALLOC\n", __func__);
+#endif // GGML_SCHED_NO_REALLOC
+
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+#endif

        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
@@ -1636,14 +1619,6 @@ ggml_backend_sched_t ggml_backend_sched_new(

    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
-
-    sched->debug_realloc = 0;
-#ifdef GGML_SCHED_NO_REALLOC
-    sched->debug_realloc = 1;
-#endif
-    const char * GGML_SCHED_DEBUG_REALLOC = getenv("GGML_SCHED_DEBUG_REALLOC");
-    sched->debug_realloc = GGML_SCHED_DEBUG_REALLOC ? atoi(GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
-
    sched->n_backends = n_backends;
    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

@@ -1660,9 +1635,6 @@ ggml_backend_sched_t ggml_backend_sched_new(
    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

-    sched->debug_graph_size = 0;
-    sched->debug_prev_graph_size = 0;
-
    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
    sched->context_buffer = (char *) malloc(sched->context_buffer_size);

--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -42,7 +42,6 @@
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
-#include <aclnnop/aclnn_ger.h>
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_grouped_matmul_v3.h>
 #include <aclnnop/aclnn_gt_scalar.h>
@@ -2207,120 +2206,78 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx,
 }

 /**
- * @brief Initializes and caches all intermediate tensors required for RoPE
- *        (Rotary Position Embedding), including support for Yarn, mRoPE,
- *        i-mRoPE, Neox repeat strategy, independent sectors, frequency factors，
- *        and multi-section rotary groups.
+ * @brief Initializes and caches sine/cosine positional encoding values
+ *        (used in RoPE, Rotary Position Embedding) for attention layers.
 *
- * This function computes and caches the per-dimension θ coefficients used for
- * Q/K rotary embedding. The cache is shared across layers, and recomputed only
- * when any dependent parameter changes.
+ * This function computes and caches the sin/cos values of
+ * θ = position * theta_scale for RoPE encoding. The cache is shared
+ * across attention layers, and only the first attention layer will
+ * trigger initialization. The cache includes repeated sin/cos values
+ * with different repeat methods depending on the @param is_neox flag.
 *
- * The function now supports:
- *   - Yarn RoPE extrapolation (via @param corr_dims and @param ext_factor)
- *   - Per-dimension independent sector exponent rules (indep_sects + sections[])
- *   - Multi-section RoPE (mRoPE) index mapping (mrope_used + is_imrope)
- *   - Frequency factor division (src2)
- *   - Neox / normal repeat expansion modes
+ * Steps performed by this function:
+ *   1. Identify whether the target tensor belongs to Q/K in attention
+ *      and restrict computation to the first layer only.
+ *   2. Initialize the theta scale array (arange → power → freq scaling).
+ *   3. Allocate sin/cos caches if the max prompt length increases.
+ *   4. Compute θ = position * theta_scale.
+ *   5. Compute sin(θ), cos(θ) and optionally scale by attn_factor.
+ *   6. Expand sin/cos values by repeat or repeat_interleave depending
+ *      on whether @param is_neox is enabled.
 *
- * @param ctx                CANN backend context, containing memory pool,
- *                           cached buffers, and runtime stream.
- * @param dst                Destination ggml_tensor whose computation
- *                           depends on RoPE (typically Qcur or Kcur).
- * @param corr_dims          [low, high] Yarn correction range.
- * @param ext_factor         Yarn extrapolation strength. 0 = disabled.
- * @param theta_scale        Base multiplier for per-dimension θ exponent.
- * @param freq_scale         Global frequency scaling factor.
- * @param attn_factor        Optional scaling applied to sin/cos (if needed).
- * @param is_neox            Whether to use Neox-style dimension interleave.
- * @param sections           4-way sector sizes for independent-section RoPE
- *                           and multi-section mRoPE (t/h/w/e).
- * @param mrope_used         Whether to enable multi-section rotary embedding.
- * @param is_imrope          Whether to apply interleaved mRoPE rules.
- * @param indep_sects        Whether each dimension runs independent exponent
- *                           resets based on @p sections.
+ * @param ctx                The CANN backend context, holding memory pool,
+ *                           stream, and persistent buffers for rope init/cache.
+ * @param dst                Destination ggml_tensor that depends on the RoPE values (usually Qcur/Kcur).
+ * @param corr_dims          YaRN correction range [low, high]; only read when ext_factor != 0.
+ * @param ext_factor         YaRN extrapolation factor; 0 disables the YaRN ramp.
+ * @param theta_scale        Scalar exponent base for computing theta scale values.
+ * @param freq_scale         Frequency scaling factor, applied to theta scale.
+ * @param attn_factor        Attention scaling factor, applied to sin/cos.
+ * @param is_neox            Neox-style repeat strategy (dim expansion vs repeat_interleave).
 */
-static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
-                                  ggml_tensor *               dst,
-                                  float *                     corr_dims,
-                                  float                       ext_factor,
-                                  float                       theta_scale,
-                                  float                       freq_scale,
-                                  float                       attn_factor,
-                                  bool                        is_neox,
-                                  int                         sections[4],
-                                  bool                        mrope_used,
-                                  bool                        is_imrope,
-                                  bool                        indep_sects,
-                                  int64_t                     rope_dims) {
+static void aclnn_cache_init(ggml_backend_cann_context & ctx,
+                             ggml_tensor *               dst,
+                             float *                     corr_dims,
+                             float                       ext_factor,
+                             float                       theta_scale,
+                             float                       freq_scale,
+                             float                       attn_factor,
+                             bool                        is_neox) {
+    ggml_tensor * src0 = dst->src[0];  // input
    ggml_tensor * src1 = dst->src[1];  // position
    ggml_tensor * src2 = dst->src[2];  // freq_factors

-    int64_t theta_scale_length = rope_dims / 2;
-    int64_t position_length    = dst->ne[2];
-
-    // TODO: check theta_scale_length and position_length.
-    if (src2 == nullptr && ctx.rope_cache.cached &&
-        ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor,
-                             is_neox, indep_sects, mrope_used, is_imrope, sections)) {
+    if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.ext_factor == ext_factor &&
+        ctx.rope_cache.theta_scale == theta_scale && ctx.rope_cache.freq_scale == freq_scale &&
+        ctx.rope_cache.attn_factor == attn_factor && ctx.rope_cache.is_neox == is_neox) {
        // use cache.
        return;
    }

-    // Step0: calculate tensor shape.
-    int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
-    size_t  theta_scale_nb[] = { sizeof(float), theta_scale_length * sizeof(float), theta_scale_length * sizeof(float),
-                                 theta_scale_length * sizeof(float) };
+    int64_t theta_scale_length = src0->ne[0] / 2;
+    int64_t theta_scale_ne[]   = { theta_scale_length, 1, 1, 1 };
+    size_t  theta_scale_nb[]   = { sizeof(float), sizeof(float), sizeof(float), theta_scale_length * sizeof(float) };

    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    int64_t position_ne[] = { 1, 1, position_length, 1 };
-    size_t  position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
+    int64_t position_length = src1->ne[0];
+    int64_t position_ne[]   = { 1, 1, position_length, 1 };
+    size_t  position_nb[]   = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };

-    int64_t cache_ne[] = { theta_scale_length, 1, position_length, 1 };
-    size_t  cache_nb[GGML_MAX_DIMS];
-    cache_nb[0] = sizeof(float);
+    int64_t theta_ne[] = { theta_scale_length, 1, position_length, 1 };
+    size_t  theta_nb[GGML_MAX_DIMS];
+    theta_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        cache_nb[i] = cache_nb[i - 1] * cache_ne[i - 1];
+        theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
    }

-    // Step1: Compute the coefficient of theta. During the cache_init process, aside from
-    // (1) multiplying by the position,
-    // (2) dividing by freq_factors,
-    // (3) computing the sine and cosine,
-    // the other parameters used in the computation generally do not change in most scenarios.
-    // Therefore, we can first compute this part of the result and then cache it.
-
-    // Step1.1: prepare theta_scale exponent. if this exponent updated, should update theta_scale_tensor.
+    // theta_scale arange, [0,1,...,ne00/2 - 1]
    acl_tensor_ptr acl_theta_scale_tensor;
-    bool           theta_scale_updated = false;
-    if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale ||
-        ctx.rope_cache.indep_sects != indep_sects) {
-        theta_scale_updated = true;
-        if (ctx.rope_cache.theta_scale_exp_host != nullptr) {
-            free(ctx.rope_cache.theta_scale_exp_host);
-        }
-        ctx.rope_cache.theta_scale_exp_host = (float *) malloc(theta_scale_length * sizeof(float));
-        GGML_ASSERT(ctx.rope_cache.theta_scale_exp_host != nullptr);
-        if (!indep_sects) {
-            ctx.rope_cache.theta_scale_exp_host[0] = 1;
-            for (int i = 1; i < theta_scale_length; i++) {
-                ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
-            }
-        } else {
-            int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
-            int sec_w     = sections[1] + sections[0];
-            int sec_e     = sections[2] + sec_w;
-
-            ctx.rope_cache.theta_scale_exp_host[0] = 1;
-            for (int i = 1; i < theta_scale_length; i++) {
-                int sector = i % sect_dims;
-                if (sector == 0 || sector == sections[0] || sector == sec_w || sector == sec_e) {
-                    ctx.rope_cache.theta_scale_exp_host[i] = 1;
-                    continue;
-                }
-                ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
-            }
-        }
+    // cache theta scale
+    if (ctx.rope_cache.theta_scale_length != theta_scale_length ||
+        // theta_scale and freq_scale should not change during the current token inference process,
+        // so we can directly use == here instead of comparing the absolute difference.
+        ctx.rope_cache.theta_scale != theta_scale || ctx.rope_cache.freq_scale != freq_scale) {
+        ctx.rope_cache.theta_scale_length = theta_scale_length;

        if (ctx.rope_cache.theta_scale_cache != nullptr) {
            ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
@@ -2328,137 +2285,74 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
        ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
                              ACL_MEM_MALLOC_HUGE_FIRST));

-        ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
-                                   ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float),
-                                   ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
-    }
-    acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
-                                                     theta_scale_ne, theta_scale_nb, 1);
+        acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
+                                                         theta_scale_ne, theta_scale_nb, 1);

-    // Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
-    // TODO: acl_yarn_ramp_tensor use rope cache.
-    bool                 yarn_ramp_tensor_updated = false;
-    ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
-    acl_tensor_ptr       acl_yarn_ramp_tensor;
-    if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
-                            ctx.rope_cache.freq_scale != freq_scale)) {
-        yarn_ramp_tensor_updated = true;
+        float start      = 0;
+        float step       = 1;
+        float stop       = theta_scale_length;
+        float n_elements = theta_scale_length;
+        aclnn_arange(ctx, acl_theta_scale_tensor.get(), start, stop, step, n_elements);

-        // -rope_yarn_ramp
-        // const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
-        // return MIN(1, MAX(0, y)) - 1;
-        yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
-        void * yarn_ramp_buffer = yarn_ramp_allocator.get();
-        acl_yarn_ramp_tensor =
-            ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
-        float          zero_value = 0, one_value = 1;
-        float          denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
-        acl_scalar_ptr low              = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
-        acl_scalar_ptr zero             = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
-        acl_scalar_ptr one              = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
-        acl_scalar_ptr denom_safe       = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
-        acl_scalar_ptr ext_factor_sc    = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);
+        ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
+        acl_tensor_ptr       acl_yarn_ramp_tensor;
+        if (ext_factor != 0) {
+            // -rope_yarn_ramp
+            // const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+            // return MIN(1, MAX(0, y)) - 1;
+            yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
+            void * yarn_ramp_buffer = yarn_ramp_allocator.get();
+            acl_yarn_ramp_tensor =
+                ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
+            float          zero_value = 0, one_value = 1;
+            float          denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
+            acl_scalar_ptr low              = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
+            acl_scalar_ptr zero             = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
+            acl_scalar_ptr one              = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
+            acl_scalar_ptr denom_safe       = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
+            acl_scalar_ptr ext_factor_sc    = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);

-        aclnn_arange(ctx, acl_yarn_ramp_tensor.get(), 0, theta_scale_length, 1, theta_scale_length);
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), low.get(), one.get());
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());
+            GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor.get(), low.get(), one.get(),
+                                    acl_yarn_ramp_tensor.get());
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());

-        // theta_interp = freq_scale * theta_extrap;
-        // theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-        // theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
-        // theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
-        // theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
-        //
-        // we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
-        // cache freq_scale + (freq_scale - 1) * ramp_mix
-        float          freq_scale_1    = freq_scale - 1;
-        acl_scalar_ptr freq_scale_sc   = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
-        acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
-        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
-    }
-
-    // Step 1.3: update theta_scale_tensor according to ext_factor or freq_scale.
-    if (ext_factor != 0) {
-        if (theta_scale_updated || yarn_ramp_tensor_updated) {
-            theta_scale_updated = true;
-            aclnn_mul(ctx, acl_theta_scale_tensor.get(), acl_yarn_ramp_tensor.get());
+            // theta_interp = freq_scale * theta_extrap;
+            // theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            // theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            // theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
+            // theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
+            //
+            // We cache the factor (freq_scale - freq_scale * ramp_mix + ramp_mix). The tensor computed above holds
+            // the negated ramp (-ramp_mix), so the cached value is freq_scale + (freq_scale - 1) * (-ramp_mix).
+            float          freq_scale_1    = freq_scale - 1;
+            acl_scalar_ptr freq_scale_sc   = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
+            acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
        }
-    } else {
-        if (freq_scale != 1 && (ctx.rope_cache.freq_scale != freq_scale || theta_scale_updated)) {
-            theta_scale_updated = true;
+
+        // power
+        acl_scalar_ptr acl_theta_scale = ggml_cann_create_scalar(&theta_scale, aclDataType::ACL_FLOAT);
+        GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale.get(), acl_theta_scale_tensor.get(),
+                                acl_theta_scale_tensor.get());
+
+        if (ext_factor != 0) {
+            aclnn_mul(ctx, acl_theta_scale_tensor.get(), acl_yarn_ramp_tensor.get());
+        } else if (freq_scale != 1) {
            aclnn_muls(ctx, acl_theta_scale_tensor.get(), freq_scale, nullptr, true);
        }
-    }
-
-    // Nothing changed, use cache.
-    if (!theta_scale_updated) {
+    } else {
+        // use cache
        acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
                                                         theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
    }

-    // Step 1.4: prepare select index if mrope
-    acl_tensor_ptr position_select_index_tensor;
-    if (mrope_used) {
-        if (ctx.rope_cache.sections[0] != sections[0] || ctx.rope_cache.sections[1] != sections[1] ||
-            ctx.rope_cache.sections[2] != sections[2] || ctx.rope_cache.sections[3] != sections[3] ||
-            ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.is_imrope != is_imrope) {
-            if (ctx.rope_cache.position_select_index_host != nullptr) {
-                free(ctx.rope_cache.position_select_index_host);
-            }
-            ctx.rope_cache.position_select_index_host = (int *) malloc(theta_scale_length * sizeof(int));
-            GGML_ASSERT(ctx.rope_cache.position_select_index_host != nullptr);
-            int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
-            int sec_w     = sections[1] + sections[0];
-            int sec_e     = sections[2] + sec_w;
-            // t,h,w,e
-            for (int i = 0; i < theta_scale_length; i++) {
-                int sector = i % sect_dims;
-
-                if (is_imrope) {  // qwen3vl apply interleaved mrope
-                    if (sector % 3 == 1 && sector < 3 * sections[1]) {
-                        ctx.rope_cache.position_select_index_host[i] = 1;
-                    } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
-                        ctx.rope_cache.position_select_index_host[i] = 2;
-                    } else if (sector % 3 == 0 && sector < 3 * sections[0]) {
-                        ctx.rope_cache.position_select_index_host[i] = 0;
-                    } else {
-                        ctx.rope_cache.position_select_index_host[i] = 3;
-                    }
-                } else {
-                    if (sector >= sections[0] && sector < sec_w) {
-                        ctx.rope_cache.position_select_index_host[i] = 1;
-                    } else if (sector >= sec_w && sector < sec_e) {
-                        ctx.rope_cache.position_select_index_host[i] = 2;
-                    } else if (sector >= sec_e) {
-                        ctx.rope_cache.position_select_index_host[i] = 3;
-                    } else {
-                        ctx.rope_cache.position_select_index_host[i] = 0;
-                    }
-                }
-            }
-
-            if (ctx.rope_cache.position_select_index != nullptr) {
-                ACL_CHECK(aclrtFree(ctx.rope_cache.position_select_index));
-            }
-            ACL_CHECK(aclrtMalloc(&ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
-                                  ACL_MEM_MALLOC_HUGE_FIRST));
-
-            ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
-                                       ctx.rope_cache.position_select_index_host, theta_scale_length * sizeof(int),
-                                       ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
-        }
-
-        position_select_index_tensor = ggml_cann_create_tensor(ctx.rope_cache.position_select_index, ACL_INT32,
-                                                               sizeof(int), theta_scale_ne, theta_scale_nb, 1);
-    }
-
-    // Step2: divide by freq_factors
    ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
+    // freq_factors
    if (src2) {
        freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
        void *         freq_fac_res_ptr = freq_fac_res_allocator.get();
@@ -2471,85 +2365,6 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
        std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
    }

-    // Step3: prepare position_tensor
-    acl_tensor_ptr       acl_position_tensor;
-    ggml_cann_pool_alloc mrope_position_acllocator(ctx.pool());
-    if (mrope_used) {
-        // Step3.1: select current position;
-        // position :
-        // pos1: [[0, 1 ,2 ,3 ],
-        // pos2:  [4, 5 ,6 ,7 ],
-        // pos3:  [8, 9 ,10,11],
-        // pos4:  [12,13,14,15] ]
-        //
-        // select index = [0, 1, 2, 2, 1, 0]
-        //
-        // selected_tensor:
-        // [[0, 1 ,2 ,3 ],
-        //  [4, 5 ,6 ,7 ],
-        //  [8, 9 ,10,11],
-        //  [8, 9 ,10,11],
-        //  [4, 5 ,6 ,7 ],
-        //  [0, 1 ,2 ,3 ]]
-        //
-        // transpose, from [seq_len:dims] to [dims:seq_len]
-        // [0, 4, 8 ,8 ,4, 0],
-        // [1, 5, 9, 9, 5, 1],
-        // [2, 6, 10,10,6 ,2],
-        // [3, 7, 11,11,7 3 ]]
-        //
-        // multipy by theta_scale_tensor
-        // [theta_scale^0, theta_scale^1, ..., theta_scale ^ n]
-
-        int64_t        mrope_position_ne[] = { position_length, 4 };
-        size_t         mrope_position_nb[] = { sizeof(int), position_length * sizeof(int) };
-        acl_tensor_ptr mrope_position =
-            ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
-                                    mrope_position_ne, mrope_position_nb, 2);
-
-        // selected position tensor's shape is a transpose of cache tensor.
-        int64_t selected_position_ne[] = { position_length, theta_scale_length };
-        size_t  selected_position_nb[] = { sizeof(float), position_length * sizeof(float) };
-        mrope_position_acllocator.alloc(theta_scale_length * position_length * sizeof(float));
-        void * mrope_position_buffer = mrope_position_acllocator.get();
-        acl_position_tensor =
-            ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
-                                    ggml_type_size(src1->type), selected_position_ne, selected_position_nb, 2);
-        GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, mrope_position.get(), 0, position_select_index_tensor.get(),
-                                acl_position_tensor.get());
-
-        // transpose
-        int64_t transposed_ne[] = { position_length, 1, theta_scale_length, 1 };
-        size_t  transposed_nb[GGML_MAX_DIMS];
-        transposed_nb[0] = sizeof(float);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            transposed_nb[i] = transposed_nb[i - 1] * transposed_ne[i - 1];
-        }
-
-        std::swap(transposed_ne[0], transposed_ne[2]);
-        std::swap(transposed_nb[0], transposed_nb[2]);
-
-        acl_position_tensor =
-            ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
-                                    ggml_type_size(src1->type), transposed_ne, transposed_nb, GGML_MAX_DIMS);
-
-    } else {
-        // auto bcast.
-        acl_position_tensor =
-            ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
-                                    position_ne, position_nb, GGML_MAX_DIMS);
-    }
-
-    // Step4: multiply by the position
-    int64_t              theta_length = theta_scale_length * position_length;
-    ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
-    void *               theta_buffer = theta_allocator.get();
-
-    acl_tensor_ptr acl_theta_tensor =
-        ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS);
-    aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
-
-    // Step5: calculate sin cos.
    // init sin_repeat && cos_repeat, only to accelerate first layer on each device
    if (position_length > ctx.rope_cache.position_length) {
        ctx.rope_cache.position_length = position_length;
@@ -2566,30 +2381,44 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
            aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
    }

+    // position
+    acl_tensor_ptr acl_position_tensor =
+        ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), position_ne,
+                                position_nb, GGML_MAX_DIMS);
+
+    // power * position
+    int64_t              theta_length = theta_scale_length * position_length;
+    ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
+    void *               theta_buffer = theta_allocator.get();
+
+    acl_tensor_ptr acl_theta_tensor =
+        ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS);
+    aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
+
    // sin/cos
    ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float));
    void *               sin_buffer = sin_allocator.get();
    acl_tensor_ptr       acl_sin_tensor =
-        ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+        ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
    aclnn_sin(ctx, acl_theta_tensor.get(), acl_sin_tensor.get());

    ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length * sizeof(float));
    void *               cos_buffer = cos_allocator.get();
    acl_tensor_ptr       acl_cos_tensor =
-        ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+        ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
    aclnn_cos(ctx, acl_theta_tensor.get(), acl_cos_tensor.get());

    if (ext_factor != 0) {
        attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
    }

-    // Step 5: multiply by attn_factor
+    // attn_factor
    if (attn_factor != 1) {
        aclnn_muls(ctx, acl_sin_tensor.get(), attn_factor, nullptr, true);
        aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
    }

-    int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 };
+    int64_t sin_reshape_ne[4] = { src0->ne[0], 1, src0->ne[2], 1 };
    size_t  sin_reshape_nb[GGML_MAX_DIMS];
    sin_reshape_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
@@ -2600,9 +2429,8 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
    acl_tensor_ptr acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
                                                                   sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);

-    // Step 6: repeat
+    // repeat
    if (is_neox) {
-        // [sinθ1, sinθ1, sinθ2, sinθ2, ..., sinθn, sinθn]
        int64_t repeatsArray[] = { 1, 1, 1, 2 };
        aclnn_repeat(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), repeatsArray);
        aclnn_repeat(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), repeatsArray);
@@ -2610,15 +2438,17 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
        int64_t num_repeats = 2;
        int64_t dim         = 3;
        int64_t output_size = theta_scale_length * num_repeats;
-        // [sinθ1, sinθ2, ..., sinθn, sinθ1, sinθ2, ..., sinθn]
        aclnn_repeat_interleave(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), dim, num_repeats, output_size);
        aclnn_repeat_interleave(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), dim, num_repeats, output_size);
    }

-    // Update cached value.
-    ctx.rope_cache.cached = true;
-    ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox,
-                       indep_sects, mrope_used, is_imrope, sections);
+    // Other layers use cache except first layer.
+    ctx.rope_cache.cached      = true;
+    ctx.rope_cache.ext_factor  = ext_factor;
+    ctx.rope_cache.theta_scale = theta_scale;
+    ctx.rope_cache.freq_scale  = freq_scale;
+    ctx.rope_cache.attn_factor = attn_factor;
+    ctx.rope_cache.is_neox     = is_neox;
 }

 #ifdef __cplusplus
@@ -2644,7 +2474,6 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    // param
    float     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    int       sections[4];
    // const int n_past     = ((int32_t *) dst->op_params)[0];
    const int n_dims     = ((int32_t *) dst->op_params)[1];
    const int mode       = ((int32_t *) dst->op_params)[2];
@@ -2659,54 +2488,22 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);

+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
    GGML_ASSERT(n_dims % 2 == 0);
-    GGML_ASSERT(n_dims <= ne00);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-    bool       is_neox    = mode & GGML_ROPE_TYPE_NEOX;
-    const bool is_imrope  = mode == GGML_ROPE_TYPE_IMROPE;  // qwen3vl apply interleaved mrope
-    // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set.
-    // Note: this bit is also set for imrope and some vision modes,
-    // so mrope_used does NOT exclusively indicate pure mrope.
-    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
-    const bool is_vision  = mode == GGML_ROPE_TYPE_VISION;
-
-    if (mrope_used) {
-        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
-    }
-
-    if (is_vision) {
-        GGML_ASSERT(n_dims == ne0 / 2);
-    }
-
-    if (is_imrope || mrope_used) {
-        is_neox = true;
-    }
-
-    int64_t rope_dims = n_dims;
-
-    //Our current RotaryPositionEmbedding does not support the VISION mode,
-    //but essentially it only modifies theta_base in mrope,
-    //then repeats it at the end in the same way as is_neox.
-    //In fact, RoPE is still applied across all dimensions.
-    if (is_vision) {
-        rope_dims = src0->ne[0];
-    }
-    int64_t tail_dims = ne00 - rope_dims;
-    bool    has_tail  = tail_dims > 0;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

    // init ctx.rope_cos/rope_sin cache
-    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
-                          mrope_used, is_imrope, is_vision, rope_dims);
+    aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox);

-    // Cache is generated with ne00 dimensions, so we use ne00 for reshape
-    int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 };
+    int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
    size_t  sin_reshape_nb[GGML_MAX_DIMS];
    sin_reshape_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
@@ -2719,6 +2516,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
 #ifdef ASCEND_310P
    // Special ROPE operation for 310P

@@ -2746,7 +2544,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

        int64_t shifts[] = { 1 };
        int64_t dims[]   = { 3 };
-        aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);

        // init [-1, 1, -1, 1, ...]
        minus_one_scale_buffer = minus_one_scale_allocator.get();
@@ -2766,7 +2564,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
        }
        int64_t index_num = src0->ne[0];
        float   value     = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor.get(), dim, index, index_num, value);
+        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, index_num, value);
    } else {
        // roll input: [q0,q1,q2,...] ->
        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
@@ -2778,7 +2576,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

        int64_t shifts[] = { src0->ne[0] / 2 };
        int64_t dims[]   = { 3 };
-        aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);

        // init [-1, -1, -1, 1, 1，1，...]
        minus_one_scale_buffer  = minus_one_scale_allocator.get();
@@ -2801,7 +2599,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
                                                                       first_half_ne, first_half_nb, GGML_MAX_DIMS);
        bool           inplace               = true;
        float          scale                 = -1;
-        aclnn_muls(ctx, acl_first_half_tensor.get(), scale, nullptr, inplace);
+        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
    }

    // TODO: n_dims < ne0
@@ -2822,15 +2620,14 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
        ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
                                src0->ne, input_nb, GGML_MAX_DIMS);

-    aclnn_mul(ctx, acl_input_roll_reshape_tensor.get(), acl_minus_one_tensor.get(),
-              acl_input_roll_mul_scale_tensor.get());
+    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, acl_input_roll_mul_scale_tensor);

    // output
    void * output_fp32_buffer;
    if (src0->type == GGML_TYPE_F32) {
-        aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get());
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get());
-        aclnn_add(ctx, acl_src.get(), acl_input_roll_mul_scale_tensor.get(), acl_dst.get());
+        aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor);
+        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor);
+        aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
        // TODO: ne0 != n_dims in mode2
    } else if (src0->type == GGML_TYPE_F16) {
        size_t input_fp32_nb[GGML_MAX_DIMS];
@@ -2851,131 +2648,54 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
        output_fp32_buffer                = fp32_allocator.get();
        acl_tensor_ptr output_fp32_tensor = ggml_cann_create_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float),
                                                                    dst->ne, input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get(), input_fp32_tensor1.get());
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get(), input_fp32_tensor2.get());
-        aclnn_add(ctx, input_fp32_tensor1.get(), input_fp32_tensor2.get(), output_fp32_tensor.get());
-        aclnn_cast(ctx, output_fp32_tensor.get(), acl_dst.get(), ACL_FLOAT16);
+        aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
+        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, input_fp32_tensor2);
+        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, output_fp32_tensor);
+        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
    }
    return;
 #endif
-    int64_t acl_mode = is_neox ? 0 : 1;

-    // Pre-define head and tail dimensions for reuse
-    int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 };
-    int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 };
+    // ggml_mode = 0 --> aclnn_model = 1
+    int64_t acl_mode = mode == 0 ? 1 : mode;

-    // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed
-    bool                 src_dst_need_trans = false;
-    ggml_cann_pool_alloc src_trans_allocator(ctx.pool());
-    ggml_cann_pool_alloc dst_trans_allocator(ctx.pool());
-    acl_tensor_ptr       acl_src_trans_tensor;
-    acl_tensor_ptr       acl_dst_trans_tensor;
-    void *               src_trans_buffer = nullptr;
-    void *               dst_trans_buffer = nullptr;
-    size_t               src_dst_trans_nb[GGML_MAX_DIMS];
-    if (src0->type == GGML_TYPE_F16) {
-        src_dst_need_trans = true;
-        src_trans_buffer   = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float));
-        dst_trans_buffer   = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float));
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
+                                        acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
+                break;
+            }
+        case GGML_TYPE_F16:
+            {
+                ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float));
+                void *               src_trans_buffer = src_trans_allocator.get();
+                ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+                void *               dst_trans_buffer = dst_trans_allocator.get();

-        src_dst_trans_nb[0] = sizeof(float);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1];
-        }
-        acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne,
-                                                       src_dst_trans_nb, GGML_MAX_DIMS);
-        acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne,
-                                                       src_dst_trans_nb, GGML_MAX_DIMS);
-        aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
-    }
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = sizeof(float);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }

-    // Step 2: Prepare head tensors for tail splitting if needed
-    acl_tensor_ptr acl_src_head;
-    acl_tensor_ptr acl_dst_head;
-    if (has_tail) {
-        // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions)
-        // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer
-        if (src_dst_need_trans) {
-            // Use F32 trans tensor strides
-            acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne,
-                                                   src_dst_trans_nb, GGML_MAX_DIMS);
-        } else {
-            // Use original F32 tensor strides
-            acl_src_head = ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb,
-                                                   GGML_MAX_DIMS);
-        }
+                acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS);
+                acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor(
+                    dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS);

-        int64_t              head_elements = rope_dims * ne01 * ne02 * ne03;
-        ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float));
-        void *               dst_head_contiguous_buffer = dst_head_contiguous_allocator.get();
+                aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);

-        size_t head_contiguous_nb[GGML_MAX_DIMS];
-        head_contiguous_nb[0] = sizeof(float);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1];
-        }
-        acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne,
-                                               head_contiguous_nb, GGML_MAX_DIMS);
-    }
+                GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(),
+                                        acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode,
+                                        acl_dst_trans_tensor.get());

-    // Step 3: Execute RotaryPositionEmbedding
-    if (has_tail) {
-        // Rotate only the head portion (first rope_dims dimensions)
-        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(),
-                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get());
-
-        // Copy head result from contiguous buffer back to destination tensor
-        if (src_dst_need_trans) {
-            acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor(
-                (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS);
-            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
-        } else {
-            acl_tensor_ptr acl_dst_head_target =
-                ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS);
-            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
-        }
-    } else if (src_dst_need_trans) {
-        // Rotate full tensor (no tail), using trans tensors
-        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
-                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
-    } else {
-        // Rotate full tensor (no tail), using original tensors
-        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
-                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
-    }
-
-    // Step 4: Copy unrotated tail portion from source to destination
-    if (has_tail) {
-        size_t src_tail_offset;
-        size_t dst_tail_offset;
-
-        auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size,
-                                    size_t * nb_src_arr, size_t * nb_dst_arr) {
-            acl_tensor_ptr acl_src_tail =
-                ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS);
-            acl_tensor_ptr acl_dst_tail =
-                ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS);
-            cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get());
-        };
-
-        if (src_dst_need_trans) {
-            // Use F32 trans tensor strides and offsets
-            src_tail_offset = rope_dims * src_dst_trans_nb[0];
-            dst_tail_offset = rope_dims * src_dst_trans_nb[0];
-            copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset,
-                             ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb);
-        } else {
-            // Use original tensor strides and offsets
-            src_tail_offset = rope_dims * nb00;
-            dst_tail_offset = rope_dims * nb0;
-            copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset,
-                             ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb);
-        }
-    }
-
-    // Step 5: Cast back to F16 if needed
-    if (src_dst_need_trans) {
-        aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
+                aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
+                break;
+            }
+        default:
+            GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
+            break;
    }
 }

@@ -3515,64 +3235,3 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
        GGML_ABORT("Function is not implemented.");
    }
 }
-
-static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * src0 = dst->src[0];  // weight
-    ggml_tensor * src1 = dst->src[1];  // input
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
-    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
-
-    const int64_t dps2 = ne2 / ne02;
-    const int64_t dps3 = ne3 / ne03;
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t i02 = i2 / dps2;
-            const int64_t i03 = i3 / dps3;
-
-            const int64_t  i12 = i2;
-            const int64_t  i13 = i3;
-            acl_tensor_ptr accumulator =
-                ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
-                                        ggml_type_size(dst->type), dst->ne, dst->nb, 2);
-
-            // The outer product needs to be accumulated in this dimension.
-            for (int64_t i1 = 0; i1 < ne11; i1++) {
-                acl_tensor_ptr acl_input = ggml_cann_create_tensor(
-                    (char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
-                    ggml_type_size(src0->type), src1->ne, src1->nb, 1);
-
-                acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
-                    (char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
-                    ggml_type_size(src0->type), src0->ne, src0->nb, 1);
-
-                ggml_cann_pool_alloc output_allocator(ctx.pool());
-                void *               output_buffer = output_allocator.alloc(ggml_nbytes(dst));
-                acl_tensor_ptr       acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
-                                                                       ggml_type_size(dst->type), dst->ne, dst->nb, 2);
-
-                GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
-                float       alpha_value = 1.0f;
-                aclScalar * alpha       = aclCreateScalar(&alpha_value, ACL_FLOAT);
-                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha);
-            }
-        }
-    }
-}
-
-void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * src0 = dst->src[0];
-
-    const enum ggml_type type = src0->type;
-
-    switch (type) {
-        case GGML_TYPE_F32:
-        case GGML_TYPE_F16:
-            ggml_cann_out_prod_fp(ctx, dst);
-            break;
-        default:
-            GGML_ABORT("Unsupport type for GGML_OP_OUT_PROD");
-            break;
-    }
-}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	6cdda87baf	ci : disable op offload in some tests	2025-11-20 17:16:50 +02:00
Georgi Gerganov	e4838046f3	llama : update worst-case graph for unified cache	2025-11-19 09:56:31 +02:00
slaren	0710d5f0f8	ggml : add GGML_SCHED_NO_REALLOC option to disable reallocations in ggml_backend_sched Enabled in ggml-ci for testing.	2025-11-17 21:29:22 +01:00