ggml : remove redundant src in ggml_cast

2026-02-12 14:03:20 +02:00 · 2025-12-09 11:16:15 +02:00
141 changed files with 7871 additions and 22908 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -4,7 +4,7 @@

 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -111,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 ENTRYPOINT [ "/app/llama-cli" ]

--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
-    exec ./llama-completion "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
    exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -34,10 +32,8 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
-    echo "  --run (-r): Run a model (chat) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin"
-    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
    echo "              ex: -m model.gguf"
    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -243,7 +243,7 @@ jobs:
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

      - name: Test llama2c (s390x)
        id: llama2c_test_s390x
@@ -252,7 +252,7 @@ jobs:
          cd build
          echo "Fetch llama2c big-endian model"
          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@@ -1400,54 +1400,25 @@ jobs:
        chip_type: ['910b', '310p']
        build: ['Release']
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
+      - name: Dependencies
        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake libcurl-devel

      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}

-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
+          cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DGGML_CANN=on \
+              -DSOC_TYPE=ascend${{ matrix.chip_type }}
+          cmake --build build -j $(nproc)

 # TODO: simplify the following workflows using a matrix
 # TODO: run lighter CI on PRs and the full CI only on master (if needed)
@@ -1799,7 +1770,7 @@ jobs:
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

  ubuntu-cmake-sanitizer-riscv64-native:
    runs-on: RISCV64
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -731,78 +731,6 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
          name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz

-
-  openEuler-cann:
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
-
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -824,7 +752,6 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
-      - openEuler-cann

    steps:
      - name: Clone
@@ -917,12 +844,6 @@ jobs:
            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

-            **openEuler:**
-            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
-            - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
-            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
-            - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)
-
      - name: Upload release
        id: upload_release
        uses: actions/github-script@v3
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,7 +15,6 @@ The project differentiates between 3 levels of contributors:
    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
--- a/README.md
+++ b/README.md
@@ -347,6 +347,19 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

+- <details>
+    <summary>Run simple text completion</summary>
+
+    To disable conversation mode explicitly, use `-no-cnv`
+
+    ```bash
+    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
+
+    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+    ```
+
+    </details>
+
 - <details>
    <summary>Constrain the output with a custom grammar</summary>

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -398,18 +398,18 @@ function gg_run_qwen3_0_6b {
    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

-    (time ./bin/llama-completion -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -73,8 +73,6 @@ add_library(${TARGET} STATIC
    ngram-cache.h
    peg-parser.cpp
    peg-parser.h
-    preset.cpp
-    preset.h
    regex-partial.cpp
    regex-partial.h
    sampling.cpp
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -47,12 +47,10 @@
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 using json = nlohmann::ordered_json;
-using namespace common_arg_utils;

 static std::initializer_list<enum llama_example> mmproj_examples = {
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_SERVER,
-    LLAMA_EXAMPLE_CLI,
 };

 static std::string read_file(const std::string & fname) {
@@ -65,15 +63,6 @@ static std::string read_file(const std::string & fname) {
    return content;
 }

-static const std::vector<common_arg> & get_common_arg_defs() {
-    static const std::vector<common_arg> options = [] {
-        common_params params;
-        auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
-        return ctx.options;
-    }();
-    return options;
-}
-
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
    this->examples = examples;
    return *this;
@@ -105,16 +94,6 @@ bool common_arg::is_exclude(enum llama_example ex) {

 bool common_arg::get_value_from_env(std::string & output) const {
    if (env == nullptr) return false;
-    if (!args_neg.empty()) {
-        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
-        std::string neg_env = env;
-        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
-        char * neg_value = std::getenv(neg_env.c_str());
-        if (neg_value) {
-            output = "0"; // falsey
-            return true;
-        }
-    }
    char * value = std::getenv(env);
    if (value) {
        output = value;
@@ -124,14 +103,6 @@ bool common_arg::get_value_from_env(std::string & output) const {
 }

 bool common_arg::has_value_from_env() const {
-    if (env != nullptr && !args_neg.empty()) {
-        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
-        std::string neg_env = env;
-        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
-        if (std::getenv(neg_env.c_str())) {
-            return true;
-        }
-    }
    return env != nullptr && std::getenv(env);
 }

@@ -162,17 +133,16 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
    return result;
 }

-std::string common_arg::to_string() const {
+std::string common_arg::to_string() {
    // params for printing to console
    const static int n_leading_spaces = 40;
    const static int n_char_per_line_help = 70; // TODO: detect this based on current console
    std::string leading_spaces(n_leading_spaces, ' ');

    std::ostringstream ss;
-    auto all_args = get_args(); // also contains args_neg
-    for (const auto & arg : all_args) {
-        if (arg == all_args.front()) {
-            if (all_args.size() == 1) {
+    for (const auto arg : args) {
+        if (arg == args.front()) {
+            if (args.size() == 1) {
                ss << arg;
            } else {
                // first arg is usually abbreviation, we need padding to make it more beautiful
@@ -181,7 +151,7 @@ std::string common_arg::to_string() const {
                ss << tmp << spaces;
            }
        } else {
-            ss << arg << (arg != all_args.back() ? ", " : "");
+            ss << arg << (arg != args.back() ? ", " : "");
        }
    }
    if (value_hint) ss << " " << value_hint;
@@ -200,31 +170,6 @@ std::string common_arg::to_string() const {
    return ss.str();
 }

-std::vector<std::string> common_arg::get_args() const {
-    std::vector<std::string> result;
-    for (const auto & arg : args) {
-        result.push_back(std::string(arg));
-    }
-    for (const auto & arg : args_neg) {
-        result.push_back(std::string(arg));
-    }
-    return result;
-}
-
-std::vector<std::string> common_arg::get_env() const {
-    std::vector<std::string> result;
-    if (env) {
-        result.push_back(std::string(env));
-    }
-    if (!args_neg.empty() && env) {
-        // for compatibility, we need to add LLAMA_ARG_NO_ variant
-        std::string neg_env = env;
-        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
-        result.push_back(neg_env);
-    }
-    return result;
-}
-
 //
 // utils
 //
@@ -360,16 +305,6 @@ static std::string get_all_kv_cache_types() {
    return msg.str();
 }

-static bool parse_bool_value(const std::string & value) {
-    if (is_truthy(value)) {
-        return true;
-    } else if (is_falsey(value)) {
-        return false;
-    } else {
-        throw std::invalid_argument("invalid boolean value");
-    }
-}
-
 //
 // CLI argument parsing functions
 //
@@ -377,13 +312,10 @@ static bool parse_bool_value(const std::string & value) {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

-    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
+    std::unordered_map<std::string, common_arg *> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
-            arg_to_options[arg] = {&opt, /* is_positive */ true};
-        }
-        for (const auto & arg : opt.args_neg) {
-            arg_to_options[arg] = {&opt, /* is_positive */ false};
+            arg_to_options[arg] = &opt;
        }
    }

@@ -392,15 +324,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        std::string value;
        if (opt.get_value_from_env(value)) {
            try {
-                if (opt.handler_void && is_truthy(value)) {
+                if (opt.handler_void && (value == "1" || value == "true")) {
                    opt.handler_void(params);
                }
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(value));
                }
-                if (opt.handler_bool) {
-                    opt.handler_bool(params, parse_bool_value(value));
-                }
                if (opt.handler_string) {
                    opt.handler_string(params, value);
                    continue;
@@ -429,9 +358,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        if (arg_to_options.find(arg) == arg_to_options.end()) {
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
-        auto & tmp = arg_to_options[arg];
-        auto opt = *tmp.first;
-        bool is_positive = tmp.second;
+        auto opt = *arg_to_options[arg];
        if (opt.has_value_from_env()) {
            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
        }
@@ -440,10 +367,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
                opt.handler_void(params);
                continue;
            }
-            if (opt.handler_bool) {
-                opt.handler_bool(params, is_positive);
-                continue;
-            }

            // arg with single value
            check_arg(i);
@@ -468,7 +391,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
            throw std::invalid_argument(string_format(
                "error while handling argument \"%s\": %s\n\n"
                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), opt.to_string().c_str()));
+                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
        }
    }

@@ -545,8 +468,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        ));
    }

-    common_log_set_verbosity_thold(params.verbosity);
-
    return true;
 }

@@ -723,53 +644,6 @@ static void add_rpc_devices(const std::string & servers) {
    }
 }

-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
-    common_params dummy_params;
-    common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
-
-    std::unordered_map<std::string, common_arg *> arg_to_options;
-    for (auto & opt : ctx_arg.options) {
-        for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
-        }
-    }
-
-    // TODO @ngxson : find a way to deduplicate this code
-
-    // handle command line arguments
-    auto check_arg = [&](int i) {
-        if (i+1 >= argc) {
-            throw std::invalid_argument("expected value for argument");
-        }
-    };
-
-    for (int i = 1; i < argc; i++) {
-        const std::string arg_prefix = "--";
-
-        std::string arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-        }
-        auto opt = *arg_to_options[arg];
-        std::string val;
-        if (opt.value_hint != nullptr) {
-            // arg with single value
-            check_arg(i);
-            val = argv[++i];
-        }
-        if (opt.value_hint_2 != nullptr) {
-            // TODO: support arg with 2 values
-            throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
-        }
-        out_map[opt] = val;
-    }
-
-    return true;
-}
-
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -815,19 +689,25 @@ static std::string list_builtin_chat_templates() {
    return msg.str();
 }

-bool common_arg_utils::is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "true" || value == "1";
+static bool is_truthy(const std::string & value) {
+    return value == "on" || value == "enabled" || value == "1";
 }

-bool common_arg_utils::is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "false" || value == "0";
+static bool is_falsey(const std::string & value) {
+    return value == "off" || value == "disabled" || value == "0";
 }

-bool common_arg_utils::is_autoy(const std::string & value) {
+static bool is_autoy(const std::string & value) {
    return value == "auto" || value == "-1";
 }

 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // default values specific to example
+    // note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
+    if (ex == LLAMA_EXAMPLE_SERVER) {
+        params.use_jinja = true;
+    }
+
    params.use_color = tty_can_use_colors();

    // load dynamic backends
@@ -905,13 +785,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ));
    add_opt(common_arg(
-        {"--display-prompt"},
        {"--no-display-prompt"},
-        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
-        [](common_params & params, bool value) {
-            params.display_prompt = value;
+        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
+        [](common_params & params) {
+            params.display_prompt = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-co", "--color"}, "[on|off|auto]",
        "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
@@ -928,7 +807,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                    string_format("error: unknown value for --color: '%s'\n", value.c_str()));
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-t", "--threads"}, "N",
        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -1061,7 +940,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        string_format(
-            ex == LLAMA_EXAMPLE_COMPLETION
+            ex == LLAMA_EXAMPLE_MAIN
                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                : "number of tokens to predict (default: %d, -1 = infinity)",
            params.n_predict),
@@ -1105,7 +984,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
-    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cache-ram", "-cram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
@@ -1113,7 +992,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.cache_ram_mib = value;
        }
-    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -1123,13 +1002,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_KV_UNIFIED"));
    add_opt(common_arg(
-        {"--context-shift"},
        {"--no-context-shift"},
-        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.ctx_shift = value;
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        [](common_params & params) {
+            params.ctx_shift = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    add_opt(common_arg(
+        {"--context-shift"},
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.ctx_shift = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1165,24 +1050,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.system_prompt = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
-        {"--perf"},
        {"--no-perf"},
-        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params, bool value) {
-            params.no_perf = !value;
-            params.sampling.no_perf = !value;
+        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params) {
+            params.no_perf = true;
+            params.sampling.no_perf = true;
        }
-    ).set_env("LLAMA_ARG_PERF"));
-    add_opt(common_arg(
-        {"--show-timings"},
-        {"--no-show-timings"},
-        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
-        [](common_params & params, bool value) {
-            params.show_timings = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
+    ).set_env("LLAMA_ARG_NO_PERF"));
    add_opt(common_arg(
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
@@ -1204,7 +1080,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.system_prompt.pop_back();
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
        {"--in-file"}, "FNAME",
        "an input file (repeat to specify multiple files)",
@@ -1234,10 +1110,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-e", "--escape"},
+        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params) {
+            params.escape = true;
+        }
+    ));
+    add_opt(common_arg(
        {"--no-escape"},
-        string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params, bool value) {
-            params.escape = value;
+        "do not process escape sequences",
+        [](common_params & params) {
+            params.escape = false;
        }
    ));
    add_opt(common_arg(
@@ -1246,53 +1128,59 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.n_print = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--prompt-cache"}, "FNAME",
        "file to cache prompt state for faster startup (default: none)",
        [](common_params & params, const std::string & value) {
            params.path_prompt_cache = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--prompt-cache-all"},
        "if specified, saves user input and generations to cache as well\n",
        [](common_params & params) {
            params.prompt_cache_all = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--prompt-cache-ro"},
        "if specified, uses the prompt cache but does not update it",
        [](common_params & params) {
            params.prompt_cache_ro = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-r", "--reverse-prompt"}, "PROMPT",
        "halt generation at PROMPT, return control in interactive mode\n",
        [](common_params & params, const std::string & value) {
            params.antiprompt.emplace_back(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-sp", "--special"},
        string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
        [](common_params & params) {
            params.special = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-cnv", "--conversation"},
-        {"-no-cnv", "--no-conversation"},
-        "whether to run in conversation mode:\n"
+        "run in conversation mode:\n"
        "- does not print special tokens and suffix/prefix\n"
        "- interactive mode is also enabled\n"
        "(default: auto enabled if chat template is available)",
-        [](common_params & params, bool value) {
-            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params) {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-no-cnv", "--no-conversation"},
+        "force disable conversation mode (default: false)",
+        [](common_params & params) {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-st", "--single-turn"},
        "run conversation for a single turn only, then exit when done\n"
@@ -1301,28 +1189,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.single_turn = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-i", "--interactive"},
        string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
        [](common_params & params) {
            params.interactive = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-if", "--interactive-first"},
        string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
        [](common_params & params) {
            params.interactive_first = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-mli", "--multiline-input"},
        "allows you to write or paste multiple lines without ending each in '\\'",
        [](common_params & params) {
            params.multiline_input = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--in-prefix-bos"},
        "prefix BOS to user inputs, preceding the `--in-prefix` string",
@@ -1330,7 +1218,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.input_prefix_bos = true;
            params.enable_chat_template = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--in-prefix"}, "STRING",
        "string to prefix user inputs with (default: empty)",
@@ -1338,7 +1226,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.input_prefix = value;
            params.enable_chat_template = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--in-suffix"}, "STRING",
        "string to suffix after user inputs with (default: empty)",
@@ -1346,15 +1234,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.input_suffix = value;
            params.enable_chat_template = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
-        {"--warmup"},
        {"--no-warmup"},
-        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.warmup = value;
+        "skip warming up the model with an empty run",
+        [](common_params & params) {
+            params.warmup = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
@@ -1745,30 +1632,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.grp_attn_n = value;
        }
-    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
    add_opt(common_arg(
        {"-gaw", "--grp-attn-w"}, "N",
        string_format("group-attention width (default: %d)", params.grp_attn_w),
        [](common_params & params, int value) {
            params.grp_attn_w = value;
        }
-    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
-        {"-kvo", "--kv-offload"},
        {"-nkvo", "--no-kv-offload"},
-        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
-        [](common_params & params, bool value) {
-            params.no_kv_offload = !value;
+        "disable KV offload",
+        [](common_params & params) {
+            params.no_kv_offload = true;
        }
-    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
    add_opt(common_arg(
-        {"--repack"},
        {"-nr", "--no-repack"},
-        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
-        [](common_params & params, bool value) {
-            params.no_extra_bufts = !value;
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
        }
-    ).set_env("LLAMA_ARG_REPACK"));
+    ).set_env("LLAMA_ARG_NO_REPACK"));
    add_opt(common_arg(
        {"--no-host"},
        "bypass host buffer allowing extra buffers to be used",
@@ -1897,14 +1782,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"-cb", "--cont-batching"},
-        {"-nocb", "--no-cont-batching"},
-        string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.cont_batching = value;
+        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.cont_batching = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
    add_opt(common_arg(
-        {"-mm", "--mmproj"}, "FILE",
+        {"-nocb", "--no-cont-batching"},
+        "disable continuous batching",
+        [](common_params & params) {
+            params.cont_batching = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
+    add_opt(common_arg(
+        {"--mmproj"}, "FILE",
        "path to a multimodal projector file. see tools/mtmd/README.md\n"
        "note: if -hf is used, this argument can be omitted",
        [](common_params & params, const std::string & value) {
@@ -1912,35 +1803,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
    add_opt(common_arg(
-        {"-mmu", "--mmproj-url"}, "URL",
+        {"--mmproj-url"}, "URL",
        "URL to a multimodal projector file. see tools/mtmd/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.url = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
    add_opt(common_arg(
-        {"--mmproj-auto"},
-        {"--no-mmproj", "--no-mmproj-auto"},
-        string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
-        [](common_params & params, bool value) {
-            params.no_mmproj = !value;
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
    add_opt(common_arg(
-        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
-        string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.mmproj_use_gpu = value;
+        "do not offload multimodal projector to GPU",
+        [](common_params & params) {
+            params.mmproj_use_gpu = false;
        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
        {"--image", "--audio"}, "FILE",
        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
        [](common_params & params, const std::string & value) {
            params.image.emplace_back(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
    add_opt(common_arg(
        {"--image-min-tokens"}, "N",
        "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -1973,13 +1862,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_MLOCK"));
    add_opt(common_arg(
-        {"--mmap"},
        {"--no-mmap"},
-        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.use_mmap = value;
+        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
+        [](common_params & params) {
+            params.use_mmap = false;
        }
-    ).set_env("LLAMA_ARG_MMAP"));
+    ).set_env("LLAMA_ARG_NO_MMAP"));
    add_opt(common_arg(
        {"--numa"}, "TYPE",
        "attempt optimizations that help on some NUMA systems\n"
@@ -2034,7 +1922,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cpu-moe", "-cmoe"},
        "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2063,7 +1951,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@@ -2077,7 +1965,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@@ -2167,11 +2055,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ));
    add_opt(common_arg(
-        {"--op-offload"},
        {"--no-op-offload"},
-        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
-        [](common_params & params, bool value) {
-            params.no_op_offload = !value;
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
        }
    ));
    add_opt(common_arg(
@@ -2367,11 +2254,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
-        {"--ppl"},
        {"--no-ppl"},
-        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](common_params & params, bool value) {
-            params.compute_ppl = value;
+        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params) {
+            params.compute_ppl = false;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
@@ -2490,13 +2376,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
    add_opt(common_arg(
-        {"--webui"},
        {"--no-webui"},
-        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.webui = value;
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
    add_opt(common_arg(
        {"--embedding", "--embeddings"},
        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2559,7 +2444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.default_template_kwargs[item.key()] = item.value().dump();
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
    add_opt(common_arg(
        {"-to", "--timeout"}, "N",
        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2601,12 +2486,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
    add_opt(common_arg(
        {"--slots"},
-        {"--no-slots"},
-        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.endpoint_slots = value;
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(common_arg(
+        {"--no-slots"},
+        "disables slots monitoring endpoint",
+        [](common_params & params) {
+            params.endpoint_slots = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
    add_opt(common_arg(
        {"--slot-save-path"}, "PATH",
        "path to save slot kv cache (default: disabled)",
@@ -2642,13 +2533,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.models_dir = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
-    add_opt(common_arg(
-        {"--models-preset"}, "PATH",
-        "path to INI file containing model presets for the router server (default: disabled)",
-        [](common_params & params, const std::string & value) {
-            params.models_preset = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
    add_opt(common_arg(
        {"--models-max"}, "N",
        string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
@@ -2657,21 +2541,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
    add_opt(common_arg(
-        {"--models-autoload"},
        {"--no-models-autoload"},
-        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.models_autoload = value;
+        "disables automatic loading of models (default: enabled)",
+        [](common_params & params) {
+            params.models_autoload = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
    add_opt(common_arg(
        {"--jinja"},
-        {"--no-jinja"},
-        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params, bool value) {
-            params.use_jinja = value;
+        string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.use_jinja = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--no-jinja"},
+        string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.use_jinja = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
    add_opt(common_arg(
        {"--reasoning-format"}, "FORMAT",
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2682,7 +2571,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.reasoning_format = common_reasoning_format_from_name(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
        {"--reasoning-budget"}, "N",
        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2690,7 +2579,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
            params.reasoning_budget = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@@ -2702,7 +2591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.chat_template = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
    add_opt(common_arg(
        {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
        string_format(
@@ -2714,18 +2603,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.chat_template = read_file(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
    add_opt(common_arg(
-        {"--prefill-assistant"},
        {"--no-prefill-assistant"},
        string_format(
            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
        ),
-        [](common_params & params, bool value) {
-            params.prefill_assistant = value;
+        [](common_params & params) {
+            params.prefill_assistant = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
    add_opt(common_arg(
        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2746,7 +2634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.simple_io = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--positive-file"}, "FNAME",
        string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2829,6 +2717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
        [](common_params & params) {
            params.verbosity = INT_MAX;
+            common_log_set_verbosity_thold(INT_MAX);
        }
    ));
    add_opt(common_arg(
@@ -2849,6 +2738,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            "(default: %d)\n", params.verbosity),
        [](common_params & params, int value) {
            params.verbosity = value;
+            common_log_set_verbosity_thold(value);
        }
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
@@ -2981,14 +2871,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.speculative.n_max = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
    add_opt(common_arg(
        {"--draft-min", "--draft-n-min"}, "N",
        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
        [](common_params & params, int value) {
            params.speculative.n_min = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
    add_opt(common_arg(
        {"--draft-p-split"}, "P",
        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -3002,14 +2892,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.speculative.p_min = std::stof(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
    add_opt(common_arg(
        {"-cd", "--ctx-size-draft"}, "N",
        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
        [](common_params & params, int value) {
            params.speculative.n_ctx = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
    add_opt(common_arg(
        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -3017,7 +2907,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.speculative.devices = parse_device_list(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
        "number of layers to store in VRAM for the draft model",
@@ -3029,21 +2919,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
    add_opt(common_arg(
        {"-md", "--model-draft"}, "FNAME",
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.model.path = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
    add_opt(common_arg(
        {"--spec-replace"}, "TARGET", "DRAFT",
        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
        [](common_params & params, const std::string & tgt, const std::string & dft) {
            params.speculative.replacements.push_back({ tgt, dft });
        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
        string_format(
@@ -3307,7 +3197,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.use_jinja = true;
            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--gpt-oss-120b-default"},
@@ -3326,7 +3216,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.use_jinja = true;
            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--vision-gemma-4b-default"},
@@ -3337,7 +3227,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_ctx = 0;
            params.use_jinja = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--vision-gemma-12b-default"},
@@ -3348,7 +3238,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_ctx = 0;
            params.use_jinja = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    return ctx_arg;
 }
--- a/common/arg.h
+++ b/common/arg.h
@@ -3,10 +3,8 @@
 #include "common.h"

 #include <set>
-#include <map>
 #include <string>
 #include <vector>
-#include <cstring>

 //
 // CLI argument parsing
@@ -16,7 +14,6 @@ struct common_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
-    std::vector<const char *> args_neg;  // for negated args like --no-xxx
    const char * value_hint   = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
@@ -26,9 +23,6 @@ struct common_arg {
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
    void (*handler_int)    (common_params & params, int) = nullptr;
-    void (*handler_bool)   (common_params & params, bool) = nullptr;
-
-    common_arg() = default;

    common_arg(
        const std::initializer_list<const char *> & args,
@@ -50,13 +44,6 @@ struct common_arg {
        void (*handler)(common_params & params)
    ) : args(args), help(help), handler_void(handler) {}

-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const std::initializer_list<const char *> & args_neg,
-        const std::string & help,
-        void (*handler)(common_params & params, bool)
-    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
-
    // support 2 values for arg
    common_arg(
        const std::initializer_list<const char *> & args,
@@ -74,33 +61,9 @@ struct common_arg {
    bool is_exclude(enum llama_example ex);
    bool get_value_from_env(std::string & output) const;
    bool has_value_from_env() const;
-    std::string to_string() const;
-
-    // for using as key in std::map
-    bool operator<(const common_arg& other) const {
-        if (args.empty() || other.args.empty()) {
-            return false;
-        }
-        return strcmp(args[0], other.args[0]) < 0;
-    }
-    bool operator==(const common_arg& other) const {
-        if (args.empty() || other.args.empty()) {
-            return false;
-        }
-        return strcmp(args[0], other.args[0]) == 0;
-    }
-
-    // get all args and env vars (including negated args/env)
-    std::vector<std::string> get_args() const;
-    std::vector<std::string> get_env() const;
+    std::string to_string();
 };

-namespace common_arg_utils {
-    bool is_truthy(const std::string & value);
-    bool is_falsey(const std::string & value);
-    bool is_autoy(const std::string & value);
-}
-
 struct common_params_context {
    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
    common_params & params;
@@ -113,11 +76,7 @@ struct common_params_context {
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

-// parse input arguments from CLI into a map
-// TODO: support repeated args in the future
-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
-
-// initialize argument parser context - used by test-arg-parser and preset
+// function to be used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

 struct common_remote_params {
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1,6 +1,5 @@
 #include "chat.h"
 #include "chat-parser.h"
-#include "chat-peg-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"
@@ -151,7 +150,6 @@ struct templates_params {
    common_chat_tool_choice tool_choice;
    json json_schema;
    bool parallel_tool_calls;
-    common_reasoning_format reasoning_format;
    bool stream;
    std::string grammar;
    bool add_generation_prompt = true;
@@ -591,16 +589,6 @@ common_chat_templates_ptr common_chat_templates_init(
            "{%- if false %}");
    }

-    // TODO @aldehir : this is a temporary fix, pending Minja changes
-    // Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
-    if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
-            // search for the error message and patch it
-            && default_template_src.find("if (message['content'] is none or") != std::string::npos) {
-        string_replace_all(default_template_src,
-            "{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
-            "{%- if false %}");
-    }
-
    std::string token_bos = bos_token_override;
    std::string token_eos = eos_token_override;
    bool add_bos = false;
@@ -999,118 +987,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    return data;
 }

-static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-
-    // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
-    auto adjusted_messages = json::array();
-    for (const auto & msg : inputs.messages) {
-        auto role = msg.value("role", "");
-        if (role != "system" && role != "assistant") {
-            // Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
-            adjusted_messages.push_back(msg);
-            continue;
-        }
-
-        auto content = json::array();
-
-        // If message contains `reasoning_content`, add it as a block of type `thinking`
-        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
-            content.push_back({
-                {"type", "thinking"},
-                {"thinking", msg.at("reasoning_content").get<std::string>()},
-            });
-        }
-
-        // If message contains `content`, add it as a block of type `text`
-        if (msg.contains("content")) {
-            if (msg.at("content").is_string()) {
-                content.push_back({
-                    {"type", "text"},
-                    {"text", msg.at("content").get<std::string>()},
-                });
-            } else if (msg.at("content").is_array()) {
-                auto blocks = msg.at("content");
-                content.insert(content.end(), blocks.begin(), blocks.end());
-            }
-        }
-
-        auto adjusted = msg;
-        adjusted["content"] = content;
-        adjusted.erase("reasoning_content");
-        adjusted_messages.push_back(adjusted);
-    }
-
-    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
-    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    auto include_grammar = true;
-
-    data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
-    data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens = {
-        "[THINK]",
-        "[/THINK]",
-        "[TOOL_CALLS]",
-        "[ARGS]",
-    };
-
-    auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
-        auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
-
-        // Response format parser
-        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
-            // Ministral wants to emit json surrounded by code fences
-            return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
-        }
-
-        // Tool call parser
-        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
-            auto tool_choice = p.choice();
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                std::string name = function.at("name");
-                const auto & schema = function.at("parameters");
-
-                tool_choice |= p.rule("tool-" + name,
-                    p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
-                    + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
-                );
-            });
-
-            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
-            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
-            auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
-
-            return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
-        }
-
-        // Content only parser
-        include_grammar = false;
-        return reasoning << p.content(p.rest());
-    });
-
-    data.parser = parser.save();
-
-    if (include_grammar) {
-        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
-
-        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                auto schema = function.at("parameters");
-                builder.resolve_refs(schema);
-            });
-            parser.build_grammar(builder, data.grammar_lazy);
-        });
-
-        data.grammar_triggers = {
-            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
-        };
-    }
-
-    return data;
-}
-
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
@@ -2465,7 +2341,6 @@ static common_chat_params common_chat_templates_apply_jinja(
    params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
    params.add_generation_prompt = inputs.add_generation_prompt;
    params.tool_choice = inputs.tool_choice;
-    params.reasoning_format = inputs.reasoning_format;
    params.enable_thinking = inputs.enable_thinking;
    params.grammar = inputs.grammar;
    params.now = inputs.now;
@@ -2629,13 +2504,6 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
    }

-    // Ministral/Mistral Large 3
-    if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
-        src.find("[TOOL_CALLS]") != std::string::npos &&
-        src.find("[ARGS]") != std::string::npos) {
-        return common_chat_params_init_ministral_3(tmpl, params);
-    }
-
    if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
        return common_chat_params_init_magistral(tmpl, params);
    }
--- a/common/common.h
+++ b/common/common.h
@@ -82,8 +82,7 @@ int32_t cpu_get_num_math();
 enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_COMPLETION,
-    LLAMA_EXAMPLE_CLI,
+    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -407,7 +406,6 @@ struct common_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool no_perf           = false; // disable performance metrics
-    bool show_timings      = true;  // show timing information on CLI
    bool ctx_shift         = false; // context shift on infinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache
@@ -464,7 +462,7 @@ struct common_params {
    std::string public_path   = "";                                                                         // NOLINT
    std::string api_prefix    = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
-    bool use_jinja = true;                                                                                  // NOLINT
+    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
@@ -484,10 +482,9 @@ struct common_params {
    bool endpoint_metrics = false;

    // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    std::string models_dir = ""; // directory containing models for the router server
+    int models_max = 4;          // maximum number of models to load simultaneously
+    bool models_autoload = true; // automatically load models when requested via the router server

    bool log_json = false;

--- a/common/console.cpp
+++ b/common/console.cpp
@@ -1,16 +1,6 @@
 #include "console.h"
-#include "log.h"
 #include <vector>
 #include <iostream>
-#include <cassert>
-#include <cstddef>
-#include <cctype>
-#include <cwctype>
-#include <cstdint>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-#include <stdarg.h>

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -40,44 +30,26 @@
 #define ANSI_COLOR_BLUE    "\x1b[34m"
 #define ANSI_COLOR_MAGENTA "\x1b[35m"
 #define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_GRAY    "\x1b[90m"
 #define ANSI_COLOR_RESET   "\x1b[0m"
 #define ANSI_BOLD          "\x1b[1m"

 namespace console {

-#if defined (_WIN32)
-    namespace {
-        // Use private-use unicode values to represent special keys that are not reported
-        // as characters (e.g. arrows on Windows). These values should never clash with
-        // real input and let the rest of the code handle navigation uniformly.
-        static constexpr char32_t KEY_ARROW_LEFT       = 0xE000;
-        static constexpr char32_t KEY_ARROW_RIGHT      = 0xE001;
-        static constexpr char32_t KEY_ARROW_UP         = 0xE002;
-        static constexpr char32_t KEY_ARROW_DOWN       = 0xE003;
-        static constexpr char32_t KEY_HOME             = 0xE004;
-        static constexpr char32_t KEY_END              = 0xE005;
-        static constexpr char32_t KEY_CTRL_ARROW_LEFT  = 0xE006;
-        static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
-        static constexpr char32_t KEY_DELETE           = 0xE008;
-    }
-
    //
    // Console state
    //
-#endif

-    static bool         advanced_display = false;
-    static bool         simple_io        = true;
-    static display_type current_display  = DISPLAY_TYPE_RESET;
+    static bool      advanced_display = false;
+    static bool      simple_io        = true;
+    static display_t current_display  = reset;

-    static FILE*        out              = stdout;
+    static FILE*     out              = stdout;

 #if defined (_WIN32)
-    static void*        hConsole;
+    static void*     hConsole;
 #else
-    static FILE*        tty              = nullptr;
-    static termios      initial_state;
+    static FILE*     tty              = nullptr;
+    static termios   initial_state;
 #endif

    //
@@ -148,7 +120,7 @@ namespace console {

    void cleanup() {
        // Reset console display
-        set_display(DISPLAY_TYPE_RESET);
+        set_display(reset);

 #if !defined(_WIN32)
        // Restore settings on POSIX systems
@@ -168,26 +140,20 @@ namespace console {
    //

    // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_type display) {
+    void set_display(display_t display) {
        if (advanced_display && current_display != display) {
-            common_log_flush(common_log_main());
+            fflush(stdout);
            switch(display) {
-                case DISPLAY_TYPE_RESET:
+                case reset:
                    fprintf(out, ANSI_COLOR_RESET);
                    break;
-                case DISPLAY_TYPE_INFO:
-                    fprintf(out, ANSI_COLOR_MAGENTA);
-                    break;
-                case DISPLAY_TYPE_PROMPT:
+                case prompt:
                    fprintf(out, ANSI_COLOR_YELLOW);
                    break;
-                case DISPLAY_TYPE_REASONING:
-                    fprintf(out, ANSI_COLOR_GRAY);
-                    break;
-                case DISPLAY_TYPE_USER_INPUT:
+                case user_input:
                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                    break;
-                case DISPLAY_TYPE_ERROR:
+                case error:
                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
            }
            current_display = display;
@@ -210,18 +176,7 @@ namespace console {
            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
                if (wc == 0) {
-                    const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
-                    const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
-                    switch (record.Event.KeyEvent.wVirtualKeyCode) {
-                        case VK_LEFT:   return ctrl_pressed ? KEY_CTRL_ARROW_LEFT  : KEY_ARROW_LEFT;
-                        case VK_RIGHT:  return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
-                        case VK_UP:     return KEY_ARROW_UP;
-                        case VK_DOWN:   return KEY_ARROW_DOWN;
-                        case VK_HOME:   return KEY_HOME;
-                        case VK_END:    return KEY_END;
-                        case VK_DELETE: return KEY_DELETE;
-                        default:        continue;
-                    }
+                    continue;
                }

                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
@@ -360,52 +315,6 @@ namespace console {
 #endif
    }

-    static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
-        unsigned char c = static_cast<unsigned char>(input[pos]);
-        if ((c & 0x80u) == 0u) {
-            advance = 1;
-            return c;
-        }
-        if ((c & 0xE0u) == 0xC0u && pos + 1 < input.size()) {
-            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
-            if ((c1 & 0xC0u) != 0x80u) {
-                advance = 1;
-                return 0xFFFD;
-            }
-            advance = 2;
-            return ((c & 0x1Fu) << 6) | (static_cast<unsigned char>(input[pos + 1]) & 0x3Fu);
-        }
-        if ((c & 0xF0u) == 0xE0u && pos + 2 < input.size()) {
-            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
-            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
-            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u) {
-                advance = 1;
-                return 0xFFFD;
-            }
-            advance = 3;
-            return ((c & 0x0Fu) << 12) |
-                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 6) |
-                   (static_cast<unsigned char>(input[pos + 2]) & 0x3Fu);
-        }
-        if ((c & 0xF8u) == 0xF0u && pos + 3 < input.size()) {
-            unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
-            unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
-            unsigned char c3 = static_cast<unsigned char>(input[pos + 3]);
-            if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u || (c3 & 0xC0u) != 0x80u) {
-                advance = 1;
-                return 0xFFFD;
-            }
-            advance = 4;
-            return ((c & 0x07u) << 18) |
-                   ((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 12) |
-                   ((static_cast<unsigned char>(input[pos + 2]) & 0x3Fu) << 6) |
-                   (static_cast<unsigned char>(input[pos + 3]) & 0x3Fu);
-        }
-
-        advance = 1;
-        return 0xFFFD; // replacement character for invalid input
-    }
-
    static void append_utf8(char32_t ch, std::string & out) {
        if (ch <= 0x7F) {
            out.push_back(static_cast<unsigned char>(ch));
@@ -427,319 +336,22 @@ namespace console {
    }

    // Helper function to remove the last UTF-8 character from a string
-    static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
-        if (pos == 0) return 0;
-        pos--;
-        while (pos > 0 && (line[pos] & 0xC0) == 0x80) {
-            pos--;
-        }
-        return pos;
-    }
-
-    static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
-        if (pos >= line.length()) return line.length();
-        pos++;
-        while (pos < line.length() && (line[pos] & 0xC0) == 0x80) {
-            pos++;
-        }
-        return pos;
-    }
-
-    static void move_cursor(int delta);
-    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
-    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
-    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
-    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
-
-    static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
-        if (char_pos >= widths.size()) {
+    static void pop_back_utf8_char(std::string & line) {
+        if (line.empty()) {
            return;
        }

-        size_t next_pos = next_utf8_char_pos(line, byte_pos);
-        int w = widths[char_pos];
-        size_t char_len = next_pos - byte_pos;
+        size_t pos = line.length() - 1;

-        line.erase(byte_pos, char_len);
-        widths.erase(widths.begin() + char_pos);
-
-        size_t p = byte_pos;
-        int tail_width = 0;
-        for (size_t i = char_pos; i < widths.size(); ++i) {
-            size_t following = next_utf8_char_pos(line, p);
-            put_codepoint(line.c_str() + p, following - p, widths[i]);
-            tail_width += widths[i];
-            p = following;
+        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
+        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
+            if ((line[pos] & 0xC0) != 0x80) {
+                break; // Found the start of the character
+            }
        }
-
-        for (int i = 0; i < w; ++i) {
-            fputc(' ', out);
-        }
-
-        move_cursor(-(tail_width + w));
+        line.erase(pos);
    }

-    static void clear_current_line(const std::vector<int> & widths) {
-        int total_width = 0;
-        for (int w : widths) {
-            total_width += (w > 0 ? w : 1);
-        }
-
-        if (total_width > 0) {
-            std::string spaces(total_width, ' ');
-            fwrite(spaces.c_str(), 1, total_width, out);
-            move_cursor(-total_width);
-        }
-    }
-
-    static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
-                                  size_t & byte_pos) {
-        move_to_line_start(char_pos, byte_pos, widths);
-        clear_current_line(widths);
-
-        line = std::move(new_line);
-        widths.clear();
-        byte_pos = 0;
-        char_pos = 0;
-
-        size_t idx = 0;
-        while (idx < line.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, idx, advance);
-            int expected_width = estimateWidth(cp);
-            int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
-            if (real_width < 0) real_width = 0;
-            widths.push_back(real_width);
-            idx += advance;
-            ++char_pos;
-            byte_pos = idx;
-        }
-    }
-
-    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
-        int back_width = 0;
-        for (size_t i = 0; i < char_pos; ++i) {
-            back_width += widths[i];
-        }
-        move_cursor(-back_width);
-        char_pos = 0;
-        byte_pos = 0;
-    }
-
-    static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
-        int forward_width = 0;
-        for (size_t i = char_pos; i < widths.size(); ++i) {
-            forward_width += widths[i];
-        }
-        move_cursor(forward_width);
-        char_pos = widths.size();
-        byte_pos = line.length();
-    }
-
-    static bool has_ctrl_modifier(const std::string & params) {
-        size_t start = 0;
-        while (start < params.size()) {
-            size_t end = params.find(';', start);
-            size_t len = (end == std::string::npos) ? params.size() - start : end - start;
-            if (len > 0) {
-                int value = 0;
-                for (size_t i = 0; i < len; ++i) {
-                    char ch = params[start + i];
-                    if (!std::isdigit(static_cast<unsigned char>(ch))) {
-                        value = -1;
-                        break;
-                    }
-                    value = value * 10 + (ch - '0');
-                }
-                if (value == 5) {
-                    return true;
-                }
-            }
-
-            if (end == std::string::npos) {
-                break;
-            }
-            start = end + 1;
-        }
-        return false;
-    }
-
-    static bool is_space_codepoint(char32_t cp) {
-        return std::iswspace(static_cast<wint_t>(cp)) != 0;
-    }
-
-    static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
-        if (char_pos == 0) {
-            return;
-        }
-
-        size_t new_char_pos = char_pos;
-        size_t new_byte_pos = byte_pos;
-        int move_width = 0;
-
-        while (new_char_pos > 0) {
-            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, prev_byte, advance);
-            if (!is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos - 1];
-            new_char_pos--;
-            new_byte_pos = prev_byte;
-        }
-
-        while (new_char_pos > 0) {
-            size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, prev_byte, advance);
-            if (is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos - 1];
-            new_char_pos--;
-            new_byte_pos = prev_byte;
-        }
-
-        move_cursor(-move_width);
-        char_pos = new_char_pos;
-        byte_pos = new_byte_pos;
-    }
-
-    static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
-        if (char_pos >= widths.size()) {
-            return;
-        }
-
-        size_t new_char_pos = char_pos;
-        size_t new_byte_pos = byte_pos;
-        int move_width = 0;
-
-        while (new_char_pos < widths.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, new_byte_pos, advance);
-            if (!is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos];
-            new_char_pos++;
-            new_byte_pos += advance;
-        }
-
-        while (new_char_pos < widths.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, new_byte_pos, advance);
-            if (is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos];
-            new_char_pos++;
-            new_byte_pos += advance;
-        }
-
-        while (new_char_pos < widths.size()) {
-            size_t advance = 0;
-            char32_t cp = decode_utf8(line, new_byte_pos, advance);
-            if (!is_space_codepoint(cp)) {
-                break;
-            }
-            move_width += widths[new_char_pos];
-            new_char_pos++;
-            new_byte_pos += advance;
-        }
-
-        move_cursor(move_width);
-        char_pos = new_char_pos;
-        byte_pos = new_byte_pos;
-    }
-
-    static void move_cursor(int delta) {
-        if (delta == 0) return;
-#if defined(_WIN32)
-        if (hConsole != NULL) {
-            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
-            COORD newCursorPosition = bufferInfo.dwCursorPosition;
-            int width = bufferInfo.dwSize.X;
-            int newX = newCursorPosition.X + delta;
-            int newY = newCursorPosition.Y;
-
-            while (newX >= width) {
-                newX -= width;
-                newY++;
-            }
-            while (newX < 0) {
-                newX += width;
-                newY--;
-            }
-
-            newCursorPosition.X = newX;
-            newCursorPosition.Y = newY;
-            SetConsoleCursorPosition(hConsole, newCursorPosition);
-        }
-#else
-        if (delta < 0) {
-            for (int i = 0; i < -delta; i++) fprintf(out, "\b");
-        } else {
-            for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
-        }
-#endif
-    }
-
-    struct history_t {
-        std::vector<std::string> entries;
-        size_t viewing_idx = SIZE_MAX;
-        std::string backup_line; // current line before viewing history
-        void add(const std::string & line) {
-            if (line.empty()) {
-                return;
-            }
-            // avoid duplicates with the last entry
-            if (entries.empty() || entries.back() != line) {
-                entries.push_back(line);
-            }
-            // also clear viewing state
-            end_viewing();
-        }
-        bool prev(std::string & cur_line) {
-            if (entries.empty()) {
-                return false;
-            }
-            if (viewing_idx == SIZE_MAX) {
-                return false;
-            }
-            if (viewing_idx > 0) {
-                viewing_idx--;
-            }
-            cur_line = entries[viewing_idx];
-            return true;
-        }
-        bool next(std::string & cur_line) {
-            if (entries.empty() || viewing_idx == SIZE_MAX) {
-                return false;
-            }
-            viewing_idx++;
-            if (viewing_idx >= entries.size()) {
-                cur_line = backup_line;
-                end_viewing();
-            } else {
-                cur_line = entries[viewing_idx];
-            }
-            return true;
-        }
-        void begin_viewing(const std::string & line) {
-            backup_line = line;
-            viewing_idx = entries.size();
-        }
-        void end_viewing() {
-            viewing_idx = SIZE_MAX;
-            backup_line.clear();
-        }
-        bool is_viewing() const {
-            return viewing_idx != SIZE_MAX;
-        }
-    } history;
-
    static bool readline_advanced(std::string & line, bool multiline_input) {
        if (out != stdout) {
            fflush(stdout);
@@ -750,33 +362,8 @@ namespace console {
        bool is_special_char = false;
        bool end_of_stream = false;

-        size_t byte_pos = 0; // current byte index
-        size_t char_pos = 0; // current character index (one char can be multiple bytes)
-
        char32_t input_char;
        while (true) {
-            assert(char_pos <= byte_pos);
-            assert(char_pos <= widths.size());
-            auto history_prev = [&]() {
-                if (!history.is_viewing()) {
-                    history.begin_viewing(line);
-                }
-                std::string new_line;
-                if (!history.prev(new_line)) {
-                    return;
-                }
-                set_line_contents(new_line, line, widths, char_pos, byte_pos);
-            };
-            auto history_next = [&]() {
-                if (history.is_viewing()) {
-                    std::string new_line;
-                    if (!history.next(new_line)) {
-                        return;
-                    }
-                    set_line_contents(new_line, line, widths, char_pos, byte_pos);
-                }
-            };
-
            fflush(out); // Ensure all output is displayed before waiting for input
            input_char = getchar32();

@@ -784,83 +371,20 @@ namespace console {
                break;
            }

-            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
+            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
                end_of_stream = true;
                break;
            }

            if (is_special_char) {
+                set_display(user_input);
                replace_last(line.back());
                is_special_char = false;
            }

            if (input_char == '\033') { // Escape sequence
                char32_t code = getchar32();
-                if (code == '[') {
-                    std::string params;
-                    while (true) {
-                        code = getchar32();
-                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
-                            break;
-                        }
-                        params.push_back(static_cast<char>(code));
-                    }
-
-                    const bool ctrl_modifier = has_ctrl_modifier(params);
-
-                    if (code == 'D') { // left
-                        if (ctrl_modifier) {
-                            move_word_left(char_pos, byte_pos, widths, line);
-                        } else if (char_pos > 0) {
-                            int w = widths[char_pos - 1];
-                            move_cursor(-w);
-                            char_pos--;
-                            byte_pos = prev_utf8_char_pos(line, byte_pos);
-                        }
-                    } else if (code == 'C') { // right
-                        if (ctrl_modifier) {
-                            move_word_right(char_pos, byte_pos, widths, line);
-                        } else if (char_pos < widths.size()) {
-                            int w = widths[char_pos];
-                            move_cursor(w);
-                            char_pos++;
-                            byte_pos = next_utf8_char_pos(line, byte_pos);
-                        }
-                    } else if (code == 'H') { // home
-                        move_to_line_start(char_pos, byte_pos, widths);
-                    } else if (code == 'F') { // end
-                        move_to_line_end(char_pos, byte_pos, widths, line);
-                    } else if (code == 'A' || code == 'B') {
-                        // up/down
-                        if (code == 'A') {
-                            history_prev();
-                            is_special_char = false;
-                        } else if (code == 'B') {
-                            history_next();
-                            is_special_char = false;
-                        }
-                    } else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
-                        std::string digits;
-                        for (char ch : params) {
-                            if (ch == ';') {
-                                break;
-                            }
-                            if (std::isdigit(static_cast<unsigned char>(ch))) {
-                                digits.push_back(ch);
-                            }
-                        }
-
-                        if (code == '~') {
-                            if (digits == "1" || digits == "7") { // home
-                                move_to_line_start(char_pos, byte_pos, widths);
-                            } else if (digits == "4" || digits == "8") { // end
-                                move_to_line_end(char_pos, byte_pos, widths, line);
-                            } else if (digits == "3") { // delete
-                                delete_at_cursor(line, widths, char_pos, byte_pos);
-                            }
-                        }
-                    }
-                } else if (code == 0x1B) {
+                if (code == '[' || code == 0x1B) {
                    // Discard the rest of the escape sequence
                    while ((code = getchar32()) != (char32_t) WEOF) {
                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
@@ -868,110 +392,32 @@ namespace console {
                        }
                    }
                }
-#if defined(_WIN32)
-            } else if (input_char == KEY_ARROW_LEFT) {
-                if (char_pos > 0) {
-                    int w = widths[char_pos - 1];
-                    move_cursor(-w);
-                    char_pos--;
-                    byte_pos = prev_utf8_char_pos(line, byte_pos);
-                }
-            } else if (input_char == KEY_ARROW_RIGHT) {
-                if (char_pos < widths.size()) {
-                    int w = widths[char_pos];
-                    move_cursor(w);
-                    char_pos++;
-                    byte_pos = next_utf8_char_pos(line, byte_pos);
-                }
-            } else if (input_char == KEY_CTRL_ARROW_LEFT) {
-                move_word_left(char_pos, byte_pos, widths, line);
-            } else if (input_char == KEY_CTRL_ARROW_RIGHT) {
-                move_word_right(char_pos, byte_pos, widths, line);
-            } else if (input_char == KEY_HOME) {
-                move_to_line_start(char_pos, byte_pos, widths);
-            } else if (input_char == KEY_END) {
-                move_to_line_end(char_pos, byte_pos, widths, line);
-            } else if (input_char == KEY_DELETE) {
-                delete_at_cursor(line, widths, char_pos, byte_pos);
-            } else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
-                if (input_char == KEY_ARROW_UP) {
-                    history_prev();
-                    is_special_char = false;
-                } else if (input_char == KEY_ARROW_DOWN) {
-                    history_next();
-                    is_special_char = false;
-                }
-#endif
            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-                if (char_pos > 0) {
-                    int w = widths[char_pos - 1];
-                    move_cursor(-w);
-                    char_pos--;
-                    size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
-                    size_t char_len = byte_pos - prev_pos;
-                    byte_pos = prev_pos;
-
-                    // remove the character
-                    line.erase(byte_pos, char_len);
-                    widths.erase(widths.begin() + char_pos);
-
-                    // redraw tail
-                    size_t p = byte_pos;
-                    int tail_width = 0;
-                    for (size_t i = char_pos; i < widths.size(); ++i) {
-                        size_t next_p = next_utf8_char_pos(line, p);
-                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
-                        tail_width += widths[i];
-                        p = next_p;
-                    }
-
-                    // clear display
-                    for (int i = 0; i < w; ++i) {
-                        fputc(' ', out);
-                    }
-                    move_cursor(-(tail_width + w));
+                if (!widths.empty()) {
+                    int count;
+                    do {
+                        count = widths.back();
+                        widths.pop_back();
+                        // Move cursor back, print space, and move cursor back again
+                        for (int i = 0; i < count; i++) {
+                            replace_last(' ');
+                            pop_cursor();
+                        }
+                        pop_back_utf8_char(line);
+                    } while (count == 0 && !widths.empty());
                }
            } else {
-                // insert character
-                std::string new_char_str;
-                append_utf8(input_char, new_char_str);
-                int w = estimateWidth(input_char);
-
-                if (char_pos == widths.size()) {
-                    // insert at the end
-                    line += new_char_str;
-                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
-                    if (real_w < 0) real_w = 0;
-                    widths.push_back(real_w);
-                    byte_pos += new_char_str.length();
-                    char_pos++;
-                } else {
-                    // insert in middle
-                    line.insert(byte_pos, new_char_str);
-
-                    int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
-                    if (real_w < 0) real_w = 0;
-
-                    widths.insert(widths.begin() + char_pos, real_w);
-
-                    // print the tail
-                    size_t p = byte_pos + new_char_str.length();
-                    int tail_width = 0;
-                    for (size_t i = char_pos + 1; i < widths.size(); ++i) {
-                        size_t next_p = next_utf8_char_pos(line, p);
-                        put_codepoint(line.c_str() + p, next_p - p, widths[i]);
-                        tail_width += widths[i];
-                        p = next_p;
-                    }
-
-                    move_cursor(-tail_width);
-
-                    byte_pos += new_char_str.length();
-                    char_pos++;
+                int offset = line.length();
+                append_utf8(input_char, line);
+                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
+                if (width < 0) {
+                    width = 0;
                }
+                widths.push_back(width);
            }

            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
+                set_display(prompt);
                replace_last(line.back());
                is_special_char = true;
            }
@@ -1005,15 +451,6 @@ namespace console {
            }
        }

-        if (!end_of_stream && !line.empty()) {
-            // remove the trailing newline for history storage
-            if (!line.empty() && line.back() == '\n') {
-                line.pop_back();
-            }
-            // TODO: maybe support multiline history entries?
-            history.add(line);
-        }
-
        fflush(out);
        return has_more;
    }
@@ -1056,82 +493,12 @@ namespace console {
    }

    bool readline(std::string & line, bool multiline_input) {
+        set_display(user_input);
+
        if (simple_io) {
            return readline_simple(line, multiline_input);
        }
        return readline_advanced(line, multiline_input);
    }

-    namespace spinner {
-        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
-        static std::condition_variable cv_stop;
-        static std::thread th;
-        static size_t frame = 0; // only modified by one thread
-        static bool running = false;
-        static std::mutex mtx;
-        static auto wait_time = std::chrono::milliseconds(100);
-        static void draw_next_frame() {
-            // don't need lock because only one thread modifies running
-            frame = (frame + 1) % sizeof(LOADING_CHARS);
-            replace_last(LOADING_CHARS[frame]);
-            fflush(out);
-        }
-        void start() {
-            std::unique_lock<std::mutex> lock(mtx);
-            if (simple_io || running) {
-                return;
-            }
-            common_log_flush(common_log_main());
-            fprintf(out, "%c", LOADING_CHARS[0]);
-            fflush(out);
-            frame = 1;
-            running = true;
-            th = std::thread([]() {
-                std::unique_lock<std::mutex> lock(mtx);
-                while (true) {
-                    if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
-                        break;
-                    }
-                    draw_next_frame();
-                }
-            });
-        }
-        void stop() {
-            {
-                std::unique_lock<std::mutex> lock(mtx);
-                if (simple_io || !running) {
-                    return;
-                }
-                running = false;
-                cv_stop.notify_all();
-            }
-            if (th.joinable()) {
-                th.join();
-            }
-            replace_last(' ');
-            pop_cursor();
-            fflush(out);
-        }
-    }
-
-    void log(const char * fmt, ...) {
-        va_list args;
-        va_start(args, fmt);
-        vfprintf(out, fmt, args);
-        va_end(args);
-    }
-
-    void error(const char * fmt, ...) {
-        va_list args;
-        va_start(args, fmt);
-        display_type cur = current_display;
-        set_display(DISPLAY_TYPE_ERROR);
-        vfprintf(out, fmt, args);
-        set_display(cur); // restore previous color
-        va_end(args);
-    }
-
-    void flush() {
-        fflush(out);
-    }
 }
--- a/common/console.h
+++ b/common/console.h
@@ -2,40 +2,18 @@

 #pragma once

-#include "common.h"
-
 #include <string>

-enum display_type {
-    DISPLAY_TYPE_RESET = 0,
-    DISPLAY_TYPE_INFO,
-    DISPLAY_TYPE_PROMPT,
-    DISPLAY_TYPE_REASONING,
-    DISPLAY_TYPE_USER_INPUT,
-    DISPLAY_TYPE_ERROR
-};
-
 namespace console {
+    enum display_t {
+        reset = 0,
+        prompt,
+        user_input,
+        error
+    };
+
    void init(bool use_simple_io, bool use_advanced_display);
    void cleanup();
-    void set_display(display_type display);
+    void set_display(display_t display);
    bool readline(std::string & line, bool multiline_input);
-
-    namespace spinner {
-        void start();
-        void stop();
-    }
-
-    // note: the logging API below output directly to stdout
-    // it can negatively impact performance if used on inference thread
-    // only use in in a dedicated CLI thread
-    // for logging in inference thread, use log.h instead
-
-    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
-    void log(const char * fmt, ...);
-
-    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
-    void error(const char * fmt, ...);
-
-    void flush();
 }
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -12,8 +12,6 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
-#include <map>
-#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -474,79 +472,36 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

 #elif defined(LLAMA_USE_HTTPLIB)

-class ProgressBar {
-    static inline std::mutex mutex;
-    static inline std::map<const ProgressBar *, int> lines;
-    static inline int max_line = 0;
-
-    static void cleanup(const ProgressBar * line) {
-        lines.erase(line);
-        if (lines.empty()) {
-            max_line = 0;
-        }
-    }
-
-    static bool is_output_a_tty() {
+static bool is_output_a_tty() {
 #if defined(_WIN32)
-        return _isatty(_fileno(stdout));
+    return _isatty(_fileno(stdout));
 #else
-        return isatty(1);
+    return isatty(1);
 #endif
+}
+
+static void print_progress(size_t current, size_t total) {
+    if (!is_output_a_tty()) {
+        return;
    }

-public:
-    ProgressBar() = default;
-
-    ~ProgressBar() {
-        std::lock_guard<std::mutex> lock(mutex);
-        cleanup(this);
+    if (!total) {
+        return;
    }

-    void update(size_t current, size_t total) {
-        if (!is_output_a_tty()) {
-            return;
-        }
+    size_t width = 50;
+    size_t pct = (100 * current) / total;
+    size_t pos = (width * current) / total;

-        if (!total) {
-            return;
-        }
-
-        std::lock_guard<std::mutex> lock(mutex);
-
-        if (lines.find(this) == lines.end()) {
-            lines[this] = max_line++;
-            std::cout << "\n";
-        }
-        int lines_up = max_line - lines[this];
-
-        size_t width = 50;
-        size_t pct = (100 * current) / total;
-        size_t pos = (width * current) / total;
-
-        std::cout << "\033[s";
-
-        if (lines_up > 0) {
-            std::cout << "\033[" << lines_up << "A";
-        }
-        std::cout << "\033[2K\r["
-            << std::string(pos, '=')
-            << (pos < width ? ">" : "")
-            << std::string(width - pos, ' ')
-            << "] " << std::setw(3) << pct << "%  ("
-            << current / (1024 * 1024) << " MB / "
-            << total / (1024 * 1024) << " MB) "
-            << "\033[u";
-
-        std::cout.flush();
-
-        if (current == total) {
-             cleanup(this);
-        }
-    }
-
-    ProgressBar(const ProgressBar &) = delete;
-    ProgressBar & operator=(const ProgressBar &) = delete;
-};
+    std::cout << "["
+              << std::string(pos, '=')
+              << (pos < width ? ">" : "")
+              << std::string(width - pos, ' ')
+              << "] " << std::setw(3) << pct << "%  ("
+              << current / (1024 * 1024) << " MB / "
+              << total / (1024 * 1024) << " MB)\r";
+    std::cout.flush();
+}

 static bool common_pull_file(httplib::Client & cli,
                             const std::string & resolve_path,
@@ -568,7 +523,6 @@ static bool common_pull_file(httplib::Client & cli,
    const char * func = __func__; // avoid __func__ inside a lambda
    size_t downloaded = existing_size;
    size_t progress_step = 0;
-    ProgressBar bar;

    auto res = cli.Get(resolve_path, headers,
        [&](const httplib::Response &response) {
@@ -600,7 +554,7 @@ static bool common_pull_file(httplib::Client & cli,
            progress_step += len;

            if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                bar.update(downloaded, total_size);
+                print_progress(downloaded, total_size);
                progress_step = 0;
            }
            return true;
@@ -608,6 +562,8 @@ static bool common_pull_file(httplib::Client & cli,
        nullptr
    );

+    std::cout << "\n";
+
    if (!res) {
        LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
        return false;
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -420,11 +420,6 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }

-void common_log_flush(struct common_log * log) {
-    log->pause();
-    log->resume();
-}
-
 static int common_get_verbosity(enum ggml_log_level level) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
--- a/common/log.h
+++ b/common/log.h
@@ -84,7 +84,6 @@ void common_log_set_file      (struct common_log * log, const char * file); // n
 void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
 void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
 void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
-void common_log_flush         (struct common_log * log);                    // flush all pending log messages

 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -1,186 +0,0 @@
-#include "arg.h"
-#include "preset.h"
-#include "peg-parser.h"
-#include "log.h"
-
-#include <fstream>
-#include <sstream>
-#include <filesystem>
-
-static std::string rm_leading_dashes(const std::string & str) {
-    size_t pos = 0;
-    while (pos < str.size() && str[pos] == '-') {
-        ++pos;
-    }
-    return str.substr(pos);
-}
-
-std::vector<std::string> common_preset::to_args() const {
-    std::vector<std::string> args;
-
-    for (const auto & [opt, value] : options) {
-        args.push_back(opt.args.back()); // use the last arg as the main arg
-        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
-            // flag option, no value
-            if (common_arg_utils::is_falsey(value)) {
-                // use negative arg if available
-                if (!opt.args_neg.empty()) {
-                    args.back() = opt.args_neg.back();
-                } else {
-                    // otherwise, skip the flag
-                    // TODO: maybe throw an error instead?
-                    args.pop_back();
-                }
-            }
-        }
-        if (opt.value_hint != nullptr) {
-            // single value
-            args.push_back(value);
-        }
-        if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
-            throw std::runtime_error(string_format(
-                "common_preset::to_args(): option '%s' has two values, which is not supported yet",
-                opt.args.back()
-            ));
-        }
-    }
-
-    return args;
-}
-
-std::string common_preset::to_ini() const {
-    std::ostringstream ss;
-
-    ss << "[" << name << "]\n";
-    for (const auto & [opt, value] : options) {
-        auto espaced_value = value;
-        string_replace_all(espaced_value, "\n", "\\\n");
-        ss << rm_leading_dashes(opt.args.back()) << " = ";
-        ss << espaced_value << "\n";
-    }
-    ss << "\n";
-
-    return ss.str();
-}
-
-static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
-    std::map<std::string, std::map<std::string, std::string>> parsed;
-
-    if (!std::filesystem::exists(path)) {
-        throw std::runtime_error("preset file does not exist: " + path);
-    }
-
-    std::ifstream file(path);
-    if (!file.good()) {
-        throw std::runtime_error("failed to open server preset file: " + path);
-    }
-
-    std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-
-    static const auto parser = build_peg_parser([](auto & p) {
-        // newline ::= "\r\n" / "\n" / "\r"
-        auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
-
-        // ws ::= [ \t]*
-        auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
-
-        // comment ::= [;#] (!newline .)*
-        auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
-
-        // eol ::= ws comment? (newline / EOF)
-        auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
-
-        // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
-        auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
-
-        // value ::= (!eol-start .)*
-        auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
-        auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
-
-        // header-line ::= "[" ws ident ws "]" eol
-        auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
-
-        // kv-line ::= ident ws "=" ws value eol
-        auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
-
-        // comment-line ::= ws comment (newline / EOF)
-        auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
-
-        // blank-line ::= ws (newline / EOF)
-        auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
-
-        // line ::= header-line / kv-line / comment-line / blank-line
-        auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
-
-        // ini ::= line* EOF
-        auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
-
-        return ini;
-    });
-
-    common_peg_parse_context ctx(contents);
-    const auto result = parser.parse(ctx);
-    if (!result.success()) {
-        throw std::runtime_error("failed to parse server config file: " + path);
-    }
-
-    std::string current_section = COMMON_PRESET_DEFAULT_NAME;
-    std::string current_key;
-
-    ctx.ast.visit(result, [&](const auto & node) {
-        if (node.tag == "section-name") {
-            const std::string section = std::string(node.text);
-            current_section = section;
-            parsed[current_section] = {};
-        } else if (node.tag == "key") {
-            const std::string key = std::string(node.text);
-            current_key = key;
-        } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
-            parsed[current_section][current_key] = std::string(node.text);
-            current_key.clear();
-        }
-    });
-
-    return parsed;
-}
-
-static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
-    std::map<std::string, common_arg> mapping;
-    for (const auto & opt : ctx_params.options) {
-        for (const auto & env : opt.get_env()) {
-            mapping[env] = opt;
-        }
-        for (const auto & arg : opt.get_args()) {
-            mapping[rm_leading_dashes(arg)] = opt;
-        }
-    }
-    return mapping;
-}
-
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
-    common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
-    auto ini_data = parse_ini_from_file(path);
-
-    for (auto section : ini_data) {
-        common_preset preset;
-        if (section.first.empty()) {
-            preset.name = COMMON_PRESET_DEFAULT_NAME;
-        } else {
-            preset.name = section.first;
-        }
-        LOG_DBG("loading preset: %s\n", preset.name.c_str());
-        for (const auto & [key, value] : section.second) {
-            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
-            if (key_to_opt.find(key) != key_to_opt.end()) {
-                preset.options[key_to_opt[key]] = value;
-                LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
-            } else {
-                // TODO: maybe warn about unknown key?
-            }
-        }
-        out[preset.name] = preset;
-    }
-
-    return out;
-}
--- a/common/preset.h
+++ b/common/preset.h
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "common.h"
-#include "arg.h"
-
-#include <string>
-#include <vector>
-#include <map>
-
-//
-// INI preset parser and writer
-//
-
-constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
-
-struct common_preset {
-    std::string name;
-    // TODO: support repeated args in the future
-    std::map<common_arg, std::string> options;
-
-    // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
-
-    // convert preset to INI format string
-    std::string to_ini() const;
-
-    // TODO: maybe implement to_env() if needed
-};
-
-// interface for multiple presets in one file
-using common_presets = std::map<std::string, common_preset>;
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -383,17 +383,6 @@ class ModelBase:
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                        tensors_to_remove.append(name)
-                    if name.endswith(".activation_scale"):  # unused
-                        tensors_to_remove.append(name)
-                    # mistral format
-                    if name.endswith(".qscale_weight"):
-                        weight_name = name.removesuffix("qscale_weight") + "weight"
-                        w = self.model_tensors[weight_name]
-                        s = self.model_tensors[name]
-                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
-                        tensors_to_remove.append(name)
-                    if name.endswith(".qscale_act"):
-                        tensors_to_remove.append(name)
            elif quant_method == "gptq":
                for name in self.model_tensors.keys():
                    if name.endswith(".qweight"):
@@ -2865,10 +2854,13 @@ class Mistral3Model(LlamaModel):
            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
+        if name.endswith("weight_scale_inv"):
+            raise ValueError("This is a quantized weight, please use BF16 weight instead")
+
        name = name.replace("language_model.", "")
        if "multi_modal_projector" in name or "vision_tower" in name:
            return []
-
        return super().modify_tensors(data_torch, name, bid)


@@ -7286,10 +7278,6 @@ class DeepseekV2Model(TextModel):
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-
-            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
-            # ref https://github.com/ggml-org/llama.cpp/pull/17945
            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])

    _experts: list[dict[str, Tensor]] | None = None
@@ -9910,18 +9898,6 @@ class MistralModel(LlamaModel):
            self.gguf_writer.add_architecture()
            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

-    def dequant_model(self):
-        # transform quantization config into HF format
-        quant_config = self.hparams.get("quantization")
-        if quant_config is not None:
-            assert quant_config["qformat_weight"] == "fp8_e4m3"
-            self.hparams["quantization_config"] = {
-                "activation_scheme": "static",
-                "quant_method": "fp8",
-                "weight_block_size": None,
-            }
-        return super().dequant_model()
-
    @staticmethod
    def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
@@ -10045,10 +10021,6 @@ class MistralMoeModel(DeepseekV2Model):
        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
        yarn_params = self.hparams["yarn"]
        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
-
-        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
-        # ref https://github.com/ggml-org/llama.cpp/pull/17945
        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
 ```

 ## Docker With CUDA
@@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ## Docker With MUSA
@@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
 ```bash
 docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -16,12 +16,12 @@ Legend:
 |-----------|------|------|------|------|------|------|------|------|------|------|------|
 |                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -36,16 +36,15 @@ Legend:
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -65,8 +64,8 @@ Legend:
 |                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 |                              LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -75,7 +74,7 @@ Legend:
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
-|                              PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                              PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -84,7 +83,7 @@ Legend:
 |                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
@@ -103,21 +102,21 @@ Legend:
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
+|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-|                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                            TOP_K | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
@@ -4964,7 +4964,6 @@
 "CPU","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CPU"
 "CPU","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CPU"
-"CPU","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CPU"
 "CPU","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CPU"
 "CPU","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CPU"
@@ -5420,45 +5419,17 @@
 "CPU","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
 "CPU","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
-"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CPU"
-"CPU","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
 "CPU","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CPU"
-"CPU","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CPU"
+"CPU","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CPU"
+"CPU","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CPU"
+"CPU","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CPU"
 "CPU","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
@@ -5684,7 +5655,6 @@
 "CPU","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CPU"
 "CPU","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
-"CPU","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CPU"
 "CPU","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CPU"
@@ -8674,13 +8644,9 @@
 "CPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CPU"
 "CPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
@@ -8700,13 +8666,9 @@
 "CPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CPU"
-"CPU","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CPU"
 "CPU","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CPU"
@@ -9449,405 +9411,18 @@
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
 "CPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CPU"
-"CPU","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CPU"
+"CPU","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CPU"
 "CPU","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","CPU"
-"CPU","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CPU"
@@ -9860,10 +9435,6 @@
 "CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CPU"
-"CPU","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
 "CPU","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CPU"
@@ -9892,30 +9463,15 @@
 "CPU","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CPU"
 "CPU","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CPU"
 "CPU","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CPU"
 "CPU","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CPU"
-"CPU","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CPU"
 "CPU","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CPU"
 "CPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CPU"
 "CPU","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CPU"
-"CPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CPU"
 "CPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CPU"
 "CPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CPU"
@@ -9924,10 +9480,6 @@
 "CPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CPU"
 "CPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CPU"
-"CPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CPU"
-"CPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CPU"
-"CPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CPU"
-"CPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CPU"
@@ -9935,16 +9487,10 @@
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CPU"
 "CPU","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","1","yes","CPU"
-"CPU","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","1","yes","CPU"
-"CPU","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","1","yes","CPU"
-"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","CPU"
+"CPU","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CPU"
 "CPU","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","CPU"
--- a/docs/ops/CUDA.csv
+++ b/docs/ops/CUDA.csv
@@ -4964,7 +4964,6 @@
 "CUDA0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","CUDA"
 "CUDA0","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","CUDA"
-"CUDA0","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","CUDA"
 "CUDA0","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","CUDA"
 "CUDA0","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","CUDA"
@@ -5420,45 +5419,17 @@
 "CUDA0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
-"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","CUDA"
-"CUDA0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
 "CUDA0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","CUDA"
-"CUDA0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","CUDA"
+"CUDA0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","CUDA"
+"CUDA0","CONT","type=bf16,ne=[2,3,5,7]","support","1","yes","CUDA"
 "CUDA0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
@@ -5684,7 +5655,6 @@
 "CUDA0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","CUDA"
 "CUDA0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
-"CUDA0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","CUDA"
 "CUDA0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","CUDA"
@@ -8674,13 +8644,9 @@
 "CUDA0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
 "CUDA0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","CUDA"
 "CUDA0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
@@ -8700,13 +8666,9 @@
 "CUDA0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","CUDA"
 "CUDA0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","CUDA"
-"CUDA0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","CUDA"
 "CUDA0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","CUDA"
@@ -9449,405 +9411,18 @@
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","CUDA"
 "CUDA0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","CUDA"
 "CUDA0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","CUDA"
-"CUDA0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[1024,1,1,1],order=1","support","1","yes","CUDA"
+"CUDA0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","1","yes","CUDA"
 "CUDA0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","CUDA"
-"CUDA0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","CUDA"
@@ -9860,10 +9435,6 @@
 "CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","1","yes","CUDA"
-"CUDA0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
 "CUDA0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","CUDA"
@@ -9892,59 +9463,34 @@
 "CUDA0","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","CUDA"
 "CUDA0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","CUDA"
 "CUDA0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","CUDA"
 "CUDA0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","CUDA"
-"CUDA0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","CUDA"
 "CUDA0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","CUDA"
 "CUDA0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[128,128,4,4]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","CUDA"
-"CUDA0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","CUDA"
+"CUDA0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
 "CUDA0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","CUDA"
-"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","CUDA"
-"CUDA0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","CUDA"
-"CUDA0","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","CUDA"
-"CUDA0","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","CUDA"
-"CUDA0","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","CUDA"
+"CUDA0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","CUDA"
+"CUDA0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","CUDA"
+"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","CUDA"
 "CUDA0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","CUDA"
-"CUDA0","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[300,64,4,4]","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","1","yes","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","CUDA"
-"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","CUDA"
+"CUDA0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","CUDA"
 "CUDA0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","CUDA"
--- a/docs/ops/OpenCL.csv
+++ b/docs/ops/OpenCL.csv
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -14,13 +14,12 @@ static void write_table_header(std::ofstream & file) {
 static void write_table_entry(std::ofstream & file, const common_arg & opt) {
    file << "| `";
    // args
-    auto all_args = opt.get_args();
-    for (const auto & arg : all_args) {
-    if (arg == all_args.front()) {
+    for (const auto & arg : opt.args) {
+    if (arg == opt.args.front()) {
            file << arg;
-            if (all_args.size() > 1) file << ", ";
+            if (opt.args.size() > 1) file << ", ";
        } else {
-            file << arg << (arg != all_args.back() ? ", " : "");
+            file << arg << (arg != opt.args.back() ? ", " : "");
        }
    }
    // value hint
@@ -77,7 +76,7 @@ static void export_md(std::string fname, llama_example ex) {
 }

 int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
+    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);

    return 0;
--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@@ -32,6 +32,10 @@ def quick_logits_check(pytorch_file, llamacpp_file):
    print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
    print(f"Max absolute difference: {max_diff:.4f}")

+    if max_diff > 1.0:
+        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
+        return False
+
    return True

 def main():
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -99,7 +99,6 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_sme        (void);
    // other
    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
-    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void);  // risc-v vector length in bytes
    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
    GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2305,11 +2305,13 @@ extern "C" {
            float                 stop,
            float                 step);

-    // q:    [n_embd_k, n_batch, n_head,    ne3 ]
-    // k:    [n_embd_k, n_kv,    n_head_kv, ne3 ]
-    // v:    [n_embd_v, n_kv,    n_head_kv, ne3 ] !! not transposed !!
-    // mask: [n_kv,     n_batch, ne32,      ne33]
-    // res:  [n_embd_v, n_head,  n_batch,   ne3 ] !! permuted !!
+#define GGML_KQ_MASK_PAD 1
+
+    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
    //
    // broadcast:
    //   n_head % n_head_kv == 0
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -25,7 +25,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 // ops that return true for this function must not use restrict pointers for their backend implementations
 bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
-        case GGML_OP_FILL:
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
@@ -312,9 +311,16 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, addr, tensor);
+#endif
+
    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];

    // see if we can merge with an existing block
@@ -350,6 +356,8 @@ static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct
    }
    // otherwise, add a new block
    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
+
+    GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
@@ -607,17 +615,13 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten

    GGML_ASSERT(parent_size >= node_size);

-    // note: we want after the freeing the chunks to continue to be aligned
-    struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
-    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
-    node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
-
    if (parent_size > node_size) {
+        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
        struct buffer_address p_addr = p_hn->addr;
        p_addr.offset += node_size;
        size_t extra_size = parent_size - node_size;
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-        ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
+        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
    }
 }

@@ -701,14 +705,7 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-
-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
-#ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, hn->addr, node);
-#endif
-
-    ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
+    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
    hn->allocated = false;
 }

--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2251,12 +2251,12 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
                                  int                         sections[4],
                                  bool                        mrope_used,
                                  bool                        is_imrope,
-                                  bool                        indep_sects,
-                                  int64_t                     rope_dims) {
+                                  bool                        indep_sects) {
+    ggml_tensor * src0 = dst->src[0];  // input
    ggml_tensor * src1 = dst->src[1];  // position
    ggml_tensor * src2 = dst->src[2];  // freq_factors

-    int64_t theta_scale_length = rope_dims / 2;
+    int64_t theta_scale_length = src0->ne[0] / 2;
    int64_t position_length    = dst->ne[2];

    // TODO: check theta_scale_length and position_length.
@@ -2331,17 +2331,18 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
        ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
                                   ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float),
                                   ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
+
+        acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
+                                                         theta_scale_ne, theta_scale_nb, 1);
    }
-    acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
-                                                     theta_scale_ne, theta_scale_nb, 1);

    // Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
-    // TODO: acl_yarn_ramp_tensor use rope cache.
    bool                 yarn_ramp_tensor_updated = false;
    ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
    acl_tensor_ptr       acl_yarn_ramp_tensor;
-    if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
-                            ctx.rope_cache.freq_scale != freq_scale)) {
+    if (ext_factor != 0 &&
+        // TODO: check more parameter.
+        (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale)) {
        yarn_ramp_tensor_updated = true;

        // -rope_yarn_ramp
@@ -2589,7 +2590,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
        aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
    }

-    int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 };
+    int64_t sin_reshape_ne[4] = { src0->ne[0], 1, dst->ne[2], 1 };
    size_t  sin_reshape_nb[GGML_MAX_DIMS];
    sin_reshape_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
@@ -2644,7 +2645,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    // param
    float     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    int       sections[4];
+    int sections[4];
    // const int n_past     = ((int32_t *) dst->op_params)[0];
    const int n_dims     = ((int32_t *) dst->op_params)[1];
    const int mode       = ((int32_t *) dst->op_params)[2];
@@ -2653,60 +2654,44 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    GGML_TENSOR_UNARY_OP_LOCALS

-    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
-    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
-    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);

+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
    GGML_ASSERT(n_dims % 2 == 0);
-    GGML_ASSERT(n_dims <= ne00);

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-    bool       is_neox    = mode & GGML_ROPE_TYPE_NEOX;
-    const bool is_imrope  = mode == GGML_ROPE_TYPE_IMROPE;  // qwen3vl apply interleaved mrope
-    // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set.
-    // Note: this bit is also set for imrope and some vision modes,
-    // so mrope_used does NOT exclusively indicate pure mrope.
-    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
-    const bool is_vision  = mode == GGML_ROPE_TYPE_VISION;
+    bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
+    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

    if (mrope_used) {
        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
    }

    if (is_vision) {
-        GGML_ASSERT(n_dims == ne0 / 2);
+        GGML_ASSERT(n_dims == ne0/2);
    }

    if (is_imrope || mrope_used) {
        is_neox = true;
    }

-    int64_t rope_dims = n_dims;
-
-    //Our current RotaryPositionEmbedding does not support the VISION mode,
-    //but essentially it only modifies theta_base in mrope,
-    //then repeats it at the end in the same way as is_neox.
-    //In fact, RoPE is still applied across all dimensions.
-    if (is_vision) {
-        rope_dims = src0->ne[0];
-    }
-    int64_t tail_dims = ne00 - rope_dims;
-    bool    has_tail  = tail_dims > 0;
-
    // init ctx.rope_cos/rope_sin cache
-    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
-                          mrope_used, is_imrope, is_vision, rope_dims);
+    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, mrope_used, is_imrope, is_vision);

-    // Cache is generated with ne00 dimensions, so we use ne00 for reshape
-    int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 };
+    int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
    size_t  sin_reshape_nb[GGML_MAX_DIMS];
    sin_reshape_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
@@ -2719,6 +2704,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
 #ifdef ASCEND_310P
    // Special ROPE operation for 310P

@@ -2858,124 +2844,46 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    }
    return;
 #endif
+
    int64_t acl_mode = is_neox ? 0 : 1;

-    // Pre-define head and tail dimensions for reuse
-    int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 };
-    int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 };
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
+                                        acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
+                break;
+            }
+        case GGML_TYPE_F16:
+            {
+                ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float));
+                void *               src_trans_buffer = src_trans_allocator.get();
+                ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+                void *               dst_trans_buffer = dst_trans_allocator.get();

-    // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed
-    bool                 src_dst_need_trans = false;
-    ggml_cann_pool_alloc src_trans_allocator(ctx.pool());
-    ggml_cann_pool_alloc dst_trans_allocator(ctx.pool());
-    acl_tensor_ptr       acl_src_trans_tensor;
-    acl_tensor_ptr       acl_dst_trans_tensor;
-    void *               src_trans_buffer = nullptr;
-    void *               dst_trans_buffer = nullptr;
-    size_t               src_dst_trans_nb[GGML_MAX_DIMS];
-    if (src0->type == GGML_TYPE_F16) {
-        src_dst_need_trans = true;
-        src_trans_buffer   = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float));
-        dst_trans_buffer   = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float));
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = sizeof(float);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }

-        src_dst_trans_nb[0] = sizeof(float);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1];
-        }
-        acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne,
-                                                       src_dst_trans_nb, GGML_MAX_DIMS);
-        acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne,
-                                                       src_dst_trans_nb, GGML_MAX_DIMS);
-        aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
-    }
+                acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS);
+                acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor(
+                    dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS);

-    // Step 2: Prepare head tensors for tail splitting if needed
-    acl_tensor_ptr acl_src_head;
-    acl_tensor_ptr acl_dst_head;
-    if (has_tail) {
-        // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions)
-        // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer
-        if (src_dst_need_trans) {
-            // Use F32 trans tensor strides
-            acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne,
-                                                   src_dst_trans_nb, GGML_MAX_DIMS);
-        } else {
-            // Use original F32 tensor strides
-            acl_src_head = ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb,
-                                                   GGML_MAX_DIMS);
-        }
+                aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);

-        int64_t              head_elements = rope_dims * ne01 * ne02 * ne03;
-        ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float));
-        void *               dst_head_contiguous_buffer = dst_head_contiguous_allocator.get();
+                GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(),
+                                        acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode,
+                                        acl_dst_trans_tensor.get());

-        size_t head_contiguous_nb[GGML_MAX_DIMS];
-        head_contiguous_nb[0] = sizeof(float);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1];
-        }
-        acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne,
-                                               head_contiguous_nb, GGML_MAX_DIMS);
-    }
-
-    // Step 3: Execute RotaryPositionEmbedding
-    if (has_tail) {
-        // Rotate only the head portion (first rope_dims dimensions)
-        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(),
-                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get());
-
-        // Copy head result from contiguous buffer back to destination tensor
-        if (src_dst_need_trans) {
-            acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor(
-                (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS);
-            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
-        } else {
-            acl_tensor_ptr acl_dst_head_target =
-                ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS);
-            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
-        }
-    } else if (src_dst_need_trans) {
-        // Rotate full tensor (no tail), using trans tensors
-        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
-                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
-    } else {
-        // Rotate full tensor (no tail), using original tensors
-        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
-                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
-    }
-
-    // Step 4: Copy unrotated tail portion from source to destination
-    if (has_tail) {
-        size_t src_tail_offset;
-        size_t dst_tail_offset;
-
-        auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size,
-                                    size_t * nb_src_arr, size_t * nb_dst_arr) {
-            acl_tensor_ptr acl_src_tail =
-                ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS);
-            acl_tensor_ptr acl_dst_tail =
-                ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS);
-            cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get());
-        };
-
-        if (src_dst_need_trans) {
-            // Use F32 trans tensor strides and offsets
-            src_tail_offset = rope_dims * src_dst_trans_nb[0];
-            dst_tail_offset = rope_dims * src_dst_trans_nb[0];
-            copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset,
-                             ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb);
-        } else {
-            // Use original tensor strides and offsets
-            src_tail_offset = rope_dims * nb00;
-            dst_tail_offset = rope_dims * nb0;
-            copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset,
-                             ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb);
-        }
-    }
-
-    // Step 5: Cast back to F16 if needed
-    if (src_dst_need_trans) {
-        aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
+                aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
+                break;
+            }
+        default:
+            GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
+            break;
    }
 }

--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -315,7 +315,7 @@ struct ggml_cann_rope_cache {
        if (theta_scale_exp_host) {
            free(theta_scale_exp_host);
        }
-        if (position_select_index_host) {
+        if(position_select_index_host) {
            free(position_select_index_host);
        }
    }
@@ -340,7 +340,7 @@ struct ggml_cann_rope_cache {

    void set(int64_t theta_scale_length,
             int64_t position_length,
-             float   ext_factor,
+             float    ext_factor,
             float   theta_scale,
             float   freq_scale,
             float   attn_factor,
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2308,7 +2308,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,

    bool cann_graph_update_required = false;
 #ifdef USE_ACL_GRAPH
-    bool use_cann_graph = true;
+    bool use_cann_graph             = true;

    static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
    if (!prefill_use_graph) {
@@ -2338,7 +2338,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
        }
    }
 #else
-    bool use_cann_graph = false;
+    bool use_cann_graph             = false;
 #endif  // USE_ACL_GRAPH
    evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);

@@ -2474,14 +2474,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
            }
        case GGML_OP_ROPE:
            {
+                // TODO: with ops-test v == 1
+                // TODO: n_dims <= ne0
+                if (op->src[0]->ne[0] != op->op_params[1]) {
+                    return false;
+                }
+
                if (op->src[0]->ne[0] > 896) {
                    return false;
                }
 #ifdef ASCEND_310P
-                // TODO: Support rope_dim < ne00(dim)
-                if (op->src[0]->ne[0] != op->op_params[1]) {
-                    return false;
-                }
                if (!ggml_is_contiguous(op->src[0])) {
                    return false;
                }
@@ -2548,7 +2550,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
        case GGML_OP_GROUP_NORM:
-            return true;
        case GGML_OP_PAD:
            // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
            return ggml_get_op_params_i32(op, 8) == 0;
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -81,11 +81,6 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif

-#if defined(__riscv)
-struct ggml_riscv_arch_features_type {
-    int rvv_vlen;
-} ggml_riscv_arch_features = { 0 };
-#endif

 #if defined(_WIN32)

@@ -192,9 +187,6 @@ typedef void * thread_ret_t;

 typedef pthread_t ggml_thread_t;

-#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
-#define GGML_THREADPOOL_N_THREADS_BITS (16)
-
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
@@ -457,7 +449,7 @@ struct ggml_threadpool {
    struct ggml_cplan  * cplan;

    // synchronization primitives
-    atomic_int n_graph;       // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
+    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
    atomic_int GGML_CACHE_ALIGN n_barrier;
    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -465,10 +457,12 @@ struct ggml_threadpool {
    // these are atomic as an annotation for thread-sanitizer
    atomic_bool stop;         // Used for stopping the threadpool altogether
    atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_int  abort;        // Used for aborting processing of a graph
+    atomic_int abort;         // Used for aborting processing of a graph

    struct ggml_compute_state * workers;   // per thread state
-    int          n_threads;   // Number of threads in the pool
+    int          n_threads_max; // number of threads in the pool
+    atomic_int   n_threads_cur; // number of threads used in the current graph
+
    int32_t      prio;        // Scheduling priority
    uint32_t     poll;        // Polling level (0 - no polling)

@@ -545,7 +539,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};

 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
    if (n_threads == 1) {
        return;
    }
@@ -562,7 +556,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
        // last thread
        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

-        // exit barrier (full seq-cst fence)
+        // exit barrier (fill seq-cst fence)
        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
        return;
    }
@@ -708,15 +702,6 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH

-#if defined(__riscv) && defined(__riscv_v_intrinsic)
-#include <riscv_vector.h>
-static void ggml_init_riscv_arch_features(void) {
-    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
-}
-#else
-static void ggml_init_riscv_arch_features(void) {}
-#endif
-
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
    GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -2643,7 +2628,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
    if (!threadpool) return;

-    const int n_threads = threadpool->n_threads;
+    const int n_threads = threadpool->n_threads_max;

 #ifndef GGML_USE_OPENMP
    struct ggml_compute_state* workers = threadpool->workers;
@@ -2719,7 +2704,7 @@ struct ggml_cplan ggml_graph_plan(
        //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
    }
    if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
    }

 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2927,14 +2912,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    struct ggml_compute_params params = {
        /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
+        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
        /*.wsize     =*/ cplan->work_size,
        /*.wdata     =*/ cplan->work_data,
        /*.threadpool=*/ tp,
    };

-    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
-
    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
        struct ggml_tensor * node = cgraph->nodes[node_n];

@@ -2956,8 +2939,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        }
    }

-    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
-
    ggml_barrier(state->threadpool);

    return 0;
@@ -2965,23 +2946,27 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

 #ifndef GGML_USE_OPENMP

+// check if thread is active
+static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+    return (state->ith < n_threads);
+}
+
 // check if thread is ready to proceed (exit from polling or sleeping)
-// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
    struct ggml_threadpool * threadpool = state->threadpool;

    if (state->pending || threadpool->stop || threadpool->pause) { return true; }

    // check for new graph/work
-    int n_graph   = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
-    if (n_graph != state->last_graph) {
-        state->pending    = (state->ith < n_threads);
-        state->last_graph = n_graph;
-        return true;
+    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    if (new_graph != state->last_graph) {
+        state->pending    = ggml_graph_compute_thread_active(state);
+        state->last_graph = new_graph;
    }

-    return false;
+    return state->pending;
 }

 // sync thread state after polling
@@ -2998,6 +2983,11 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
    struct ggml_threadpool * threadpool = state->threadpool;

+    // Skip polling for unused threads
+    if (!ggml_graph_compute_thread_active(state)) {
+        return state->pending;
+    }
+
    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
    // Perhaps, we can adjust it dynamically based on load and things.
    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3059,6 +3049,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
        ggml_graph_compute_check_for_work(state);
        if (state->pending) {
            state->pending = false;
+
            ggml_graph_compute_thread(state);
        }
    }
@@ -3073,15 +3064,14 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

    ggml_mutex_lock(&threadpool->mutex);

-    // Update the number of active threads and the graph count
-    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
-    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
+    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);

-    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
+    // Update the number of active threads
+    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);

    // Indicate the graph is ready to be processed
    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
+    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);

    if (threadpool->pause) {
       // Update main thread prio and affinity to match the threadpool settings
@@ -3119,7 +3109,8 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
        threadpool->pause            = tpp->paused;
        threadpool->abort            = -1;
        threadpool->workers          = NULL;
-        threadpool->n_threads        = tpp->n_threads;
+        threadpool->n_threads_max    = tpp->n_threads;
+        threadpool->n_threads_cur    = tpp->n_threads;
        threadpool->poll             = tpp->poll;
        threadpool->prio             = tpp->prio;
        threadpool->ec               = GGML_STATUS_SUCCESS;
@@ -3214,7 +3205,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
            {
                // update the number of threads from the actual number of threads that we got from OpenMP
                n_threads = omp_get_num_threads();
-                atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
+                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
            }

            // Apply thread CPU mask and priority
@@ -3227,13 +3218,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
            ggml_graph_compute_thread(&threadpool->workers[ith]);
        }
    } else {
-        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
+        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
        ggml_graph_compute_thread(&threadpool->workers[0]);
    }
 #else
-    if (n_threads > threadpool->n_threads) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
-        n_threads = threadpool->n_threads;
+    if (n_threads > threadpool->n_threads_max) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
+        n_threads = threadpool->n_threads_max;
    }

    // Kick all threads to start the new graph
@@ -3473,14 +3464,6 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }

-int ggml_cpu_get_rvv_vlen(void) {
-#if defined(__riscv) && defined(__riscv_v_intrinsic)
-    return ggml_riscv_arch_features.rvv_vlen;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
    return 1;
@@ -3647,10 +3630,6 @@ void ggml_cpu_init(void) {
        ggml_init_arm_arch_features();
 #endif

-#if defined(__riscv)
-        ggml_init_riscv_arch_features();
-#endif
-
        is_first_call = false;
    }

--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -583,10 +583,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
        }
-        if (ggml_cpu_get_rvv_vlen() > 0) {
-            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
-            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
-        }
        if (ggml_cpu_has_vsx()) {
            features.push_back({ "VSX", "1" });
        }
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -2169,8 +2169,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
-            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -67,22 +67,19 @@
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
-#define GGML_CUDA_CC_RDNA3_5    (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
 #define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

-#define GGML_CUDA_CC_IS_AMD(cc)     (cc >= GGML_CUDA_CC_OFFSET_AMD)
-#define GGML_CUDA_CC_IS_RDNA(cc)    (cc >= GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_RDNA1(cc)   (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
-#define GGML_CUDA_CC_IS_RDNA2(cc)   (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
-#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_RDNA3(cc)   (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
-#define GGML_CUDA_CC_IS_RDNA4(cc)   (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc)     (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
-#define GGML_CUDA_CC_IS_CDNA(cc)    (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_CDNA1(cc)   (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
-#define GGML_CUDA_CC_IS_CDNA2(cc)   (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
-#define GGML_CUDA_CC_IS_CDNA3(cc)   (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
+#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
+#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
+#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
 #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
--- a/ggml/src/ggml-cuda/diag.cu
+++ b/ggml/src/ggml-cuda/diag.cu
@@ -1,77 +0,0 @@
-#include "convert.cuh"
-#include "diag.cuh"
-#include "ggml.h"
-
-template <typename T>
-static __global__ void diag_kernel(T * __restrict__ dst,
-                                   const T * __restrict__ src,
-                                   const int64_t ne0,
-                                   const int64_t ne1,
-                                   const int64_t ne2,
-                                   const int64_t ne3,
-                                   const int64_t total_elements) {
-    const int64_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (global_idx >= total_elements) {
-        return;
-    }
-
-    const int64_t i0 = global_idx % ne0;
-    const int64_t i1 = (global_idx / ne0) % ne1;
-    const int64_t i2 = (global_idx / (ne0 * ne1)) % ne2;
-    const int64_t i3 = global_idx / (ne0 * ne1 * ne2);
-
-    const int64_t dst_idx = ((i3 * ne2 + i2) * ne1 + i1) * ne0 + i0;
-
-    if (i0 == i1) {
-        const int64_t batch_idx = i3 * ne2 + i2;
-        const int64_t src_idx   = batch_idx * ne0 + i0;
-        dst[dst_idx]            = src[src_idx];
-    } else {
-        dst[dst_idx] = ggml_cuda_cast<T>(0);
-    }
-    GGML_UNUSED_VARS(ne3);
-}
-
-void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-
-    void *       dst_d  = dst->data;
-    const void * src0_d = src0->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    GGML_ASSERT(ne00 == ne0);
-    GGML_ASSERT(ne01 == 1);
-    GGML_ASSERT(ne02 == ne2);
-    GGML_ASSERT(ne03 == ne3);
-
-    const int64_t n_elems    = ggml_nelements(dst);
-    const int64_t num_blocks = (n_elems + CUDA_DIAG_BLOCK_SIZE - 1) / CUDA_DIAG_BLOCK_SIZE;
-
-    switch (dst->type) {
-        case GGML_TYPE_F32:
-            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((float *) dst_d, (const float *) src0_d, ne0,
-                                                                         ne1, ne2, ne3, n_elems);
-            break;
-        case GGML_TYPE_F16:
-            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((half *) dst_d, (const half *) src0_d, ne0,
-                                                                         ne1, ne2, ne3, n_elems);
-            break;
-        default:
-            GGML_ABORT("unsupported type");
-    }
-}
--- a/ggml/src/ggml-cuda/diag.cuh
+++ b/ggml/src/ggml-cuda/diag.cuh
@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_DIAG_BLOCK_SIZE 256
-
-void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
    const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
    const int iter_j = (ne01 + (ncols1    - 1)) / ncols1;

-    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
    const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
    int bidx = bidx0 - 1;
    int kbc_stop = kbc0;
    while(true) {
-        const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+        const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
        if (kbc == kbc_stop) { // Did not have any data.
            bidx--;
            kbc_stop = kbc;
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -955,31 +955,22 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
            (K_h2 + int64_t(kb0)*nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K, k_VKQ_sup);
    }

-    // kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
-    if constexpr (ncols2 == 1) {
-        constexpr bool oob_check = true;
-        for (; kb0 < kb0_stop-1; ++kb0) {
-            constexpr bool last_iter = false;
-            constexpr int  k_VKQ_sup = nbatch_fa;
-            flash_attn_ext_f16_iter
-                <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
-                 T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
-                (Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
-                 ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
-                 KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
-        }
-        constexpr bool last_iter = true;
-        const     int  k_VKQ_sup = ne11 - kb0*nbatch_fa;
+    for (; kb0 < kb0_stop-1; ++kb0) {
+        constexpr bool last_iter = false;
+        constexpr bool oob_check = false;
+        constexpr int  k_VKQ_sup = nbatch_fa;
        flash_attn_ext_f16_iter
            <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
-              T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
+             T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
            (Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
             ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
             KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
-    } else {
-        constexpr bool oob_check = false;
-        for (; kb0 < kb0_stop-1; ++kb0) {
-            constexpr bool last_iter = false;
+    }
+    // kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
+    if constexpr (ncols2 == 1) {
+        if (ne11 % nbatch_fa == 0) {
+            constexpr bool last_iter = true;
+            constexpr bool oob_check = false;
            constexpr int  k_VKQ_sup = nbatch_fa;
            flash_attn_ext_f16_iter
                <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
@@ -987,8 +978,20 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
                (Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
                 ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
                 KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
+        } else {
+            constexpr bool last_iter = true;
+            constexpr bool oob_check = true;
+            const     int  k_VKQ_sup = ne11 - kb0*nbatch_fa;
+            flash_attn_ext_f16_iter
+                <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
+                 T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
+                (Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
+                 ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
+                 KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
        }
+    } else {
        constexpr bool last_iter = true;
+        constexpr bool oob_check = false;
        constexpr int  k_VKQ_sup = nbatch_fa;
        flash_attn_ext_f16_iter
            <DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
@@ -1380,8 +1383,8 @@ static __global__ void flash_attn_ext_f16(
    const int iter_j = (ne01.z + (ncols1    - 1)) / ncols1;

    // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    int       kbc      = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
    // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
@@ -1401,7 +1404,7 @@ static __global__ void flash_attn_ext_f16(
        const float2 * Q_f2   = (const float2 *) (Q + nb03*sequence + nb02* head0);
        const half2  * K_h2   = (const half2  *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
        const half   * mask_h = ncols2 == 1 && !mask ? nullptr :
-            (const half *) (mask + nb33*(sequence % ne33));
+            (const half  *) (mask + nb33*(sequence % ne33));
        float2       * dstk   = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);

        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -36,26 +36,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
    const ggml_tensor * KQV  = dst;
    const ggml_tensor * Q    = dst->src[0];
    const ggml_tensor * K    = dst->src[1];
-    const ggml_tensor * V    = dst->src[2];
    const ggml_tensor * mask = dst->src[3];

    float max_bias = 0.0f;
    memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));

-    // Edge cases like no mask, ALiBi, unpadded K/V, or misaligned addresses for large data transfers
-    //     are put into the template specialization without GQA optimizations.
-    bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
-    for (const ggml_tensor * t : {Q, K, V, mask}) {
-        if (t == nullptr) {
-            continue;
-        }
-        for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
-            if (t->nb[i] % 16 != 0) {
-                use_gqa_opt = false;
-                break;
-            }
-        }
-    }
+    const bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;

    GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
    const int gqa_ratio = Q->ne[2] / K->ne[2];
--- a/ggml/src/ggml-cuda/fill.cu
+++ b/ggml/src/ggml-cuda/fill.cu
@@ -4,7 +4,7 @@
 #define CUDA_FILL_BLOCK_SIZE 256

 template <typename T>
-static __global__ void fill_kernel(T * dst, const int64_t k, const T value) {
+static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) {
    const int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -20,7 +20,6 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
-#include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
@@ -2642,9 +2641,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
                break;
-        case GGML_OP_DIAG:
-            ggml_cuda_op_diag(ctx, dst);
-            break;
        case GGML_OP_DIAG_MASK_INF:
            ggml_cuda_op_diag_mask_inf(ctx, dst);
            break;
@@ -4313,7 +4309,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_UNARY_OP_EXPM1:
                case GGML_UNARY_OP_SOFTPLUS:
                case GGML_UNARY_OP_ELU:
-                case GGML_UNARY_OP_XIELU:
                case GGML_UNARY_OP_FLOOR:
                case GGML_UNARY_OP_CEIL:
                case GGML_UNARY_OP_ROUND:
@@ -4629,10 +4624,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_FILL:
        case GGML_OP_CUMSUM:
        case GGML_OP_TRI:
-        case GGML_OP_DIAG:
-        case GGML_OP_SOLVE_TRI:
            return true;
-
+        case GGML_OP_SOLVE_TRI:
+            return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
        default:
            return false;
    }
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -189,9 +189,6 @@ namespace ggml_cuda_mma {
                return 8 * (threadIdx.x / 16) + l;
 #elif defined(RDNA3)
                return 2 * l + (threadIdx.x / 16);
-#else
-                NO_DEVICE_CODE;
-                return -1;
 #endif // defined(RDNA4)
            } else {
                NO_DEVICE_CODE;
@@ -293,12 +290,8 @@ namespace ggml_cuda_mma {
            }
        }
 #elif defined(AMD_WMMA_AVAILABLE)
-#if defined(RDNA3)
-        // RDNA3 has duplicated data as input.
-        static constexpr int ne = I * J / 32 * 2;
-#else
+
        static constexpr int ne = I * J / 32;
-#endif // defined(RDNA3)
        half2 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
@@ -317,14 +310,7 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 8) {
-#if defined(RDNA4)
                return 4 * (threadIdx.x / 16) + l;
-#elif defined(RDNA3)
-                return l;
-#else
-                NO_DEVICE_CODE;
-                return -1;
-#endif // defined(RDNA4)
            } else {
                NO_DEVICE_CODE;
                return -1;
@@ -380,16 +366,11 @@ namespace ggml_cuda_mma {
        static constexpr int         I  = I_;
        static constexpr int         J  = J_;
        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
+        static constexpr int         ne = I * J / WARP_SIZE;

-#if defined(AMD_WMMA_AVAILABLE)
-#if defined(RDNA3)
-        // RDNA3 has duplicated data as input.
-        static constexpr int ne = I * J / 32 * 2;
-#else
-        static constexpr int ne = I * J / 32;
-#endif // defined(RDNA3)
        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};

+#if defined(AMD_WMMA_AVAILABLE)
        static constexpr __device__ bool supported() {
            if (I == 16 && J == 8) return true;
            return false;
@@ -406,23 +387,13 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 8) {
-#if defined(RDNA4)
                return 4 * (threadIdx.x / 16) + l;
-#elif defined(RDNA3)
-                return l;
-#else
-                NO_DEVICE_CODE;
-                return -1;
-#endif // defined(RDNA4)
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }
 #else
-        static constexpr int ne = I * J / WARP_SIZE;
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
-
        static constexpr __device__ bool supported() {
            if (I ==  8 && J ==  8) return true;
            if (I == 16 && J ==  4) return true;
@@ -575,14 +546,8 @@ namespace ggml_cuda_mma {
        }
 #elif defined(AMD_WMMA_AVAILABLE)
        if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
-#if defined(RDNA4)
-                ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
-#elif defined(RDNA3)
-                ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
-                ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
-#else
-                NO_DEVICE_CODE;
-#endif // defined(RDNA4)
+            ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+
        } else if constexpr (std::is_same_v<T, int>) {
            if constexpr (I == 16 && J == 4) {
                int64_t * xi = (int64_t *) t.x;
@@ -923,16 +888,6 @@ namespace ggml_cuda_mma {
        const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
        const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
-#elif defined(RDNA3)
-        using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
-        using floatx8_t = __attribute__((ext_vector_type(8))) float;
-        floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
-        const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
-        const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
-        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
-#else
-        GGML_UNUSED_VARS(D, A, B);
-        NO_DEVICE_CODE;
 #endif // RDNA4
 #else
        GGML_UNUSED_VARS(D, A, B);
@@ -950,16 +905,6 @@ namespace ggml_cuda_mma {
        const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
        const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
-#elif defined(RDNA3)
-        using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
-        using floatx8_t = __attribute__((ext_vector_type(8))) float;
-        floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
-        const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
-        const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
-        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
-#else
-        GGML_UNUSED_VARS(D, A, B);
-        NO_DEVICE_CODE;
 #endif // RDNA4
 #else
        GGML_UNUSED_VARS(D, A, B);
--- a/ggml/src/ggml-cuda/mmf.cu
+++ b/ggml/src/ggml-cuda/mmf.cu
@@ -151,9 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
            return false;
        }
    } else {
-        if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
-            return false;
-        } else if (src1_ncols > 16) {
+        if (src1_ncols > 16) {
            return false;
        }
    }
@@ -162,9 +160,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
        case GGML_TYPE_F32:
            return ampere_mma_available(cc);
        case GGML_TYPE_F16:
-            return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
+            return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
        case GGML_TYPE_BF16:
-            return ampere_mma_available(cc) || amd_wmma_available(cc);
+            return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
        default:
            return false;
    }
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -765,10 +765,7 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
                return ne11 <= 8;
            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
                if (fp16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA3(cc)) {
-                        return ne11 <= 3;
-                    }
-                    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
                        return ne11 <= 5;
                    }
                    return ne11 <= 2;
--- a/ggml/src/ggml-cuda/solve_tri.cu
+++ b/ggml/src/ggml-cuda/solve_tri.cu
@@ -3,80 +3,6 @@
 #include "solve_tri.cuh"

 #define MAX_N_FAST 64
-#define MAX_K_FAST 32
-
-static __global__ void get_batch_pointers(const float *  A,
-                                          float *        X,
-                                          const float ** A_ptrs,
-                                          float **       X_ptrs,
-                                          int64_t        ne02,
-                                          int64_t        total_batches,
-                                          size_t         s02,
-                                          size_t         s03,
-                                          size_t         s2,
-                                          size_t         s3) {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= total_batches) {
-        return;
-    }
-
-    const int64_t i3 = idx / ne02;
-    const int64_t i2 = idx % ne02;
-
-    A_ptrs[idx] = A + i3 * s03 + i2 * s02;
-    X_ptrs[idx] = X + i3 * s3 + i2 * s2;
-}
-
-static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
-                                 const float *               A,
-                                 const float *               B,
-                                 float *                     X,
-                                 int                         n,
-                                 int                         k,
-                                 int64_t                     ne02,
-                                 int64_t                     ne03,
-                                 size_t                      s02,
-                                 size_t                      s03,
-                                 size_t                      s12,
-                                 size_t                      s13,
-                                 size_t                      s2,
-                                 size_t                      s3,
-                                 cudaStream_t                stream) {
-    const float   alpha         = 1.0f;
-    const int64_t total_batches = ne02 * ne03;
-    if (total_batches == 0) {
-        return;
-    }
-
-    // Bulk copy B -> X (contiguous tensors)
-    if (X != B) {
-        const int64_t total_elements_BX = n * k * total_batches;
-        CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-    }
-
-    const int id = ggml_cuda_get_device();
-
-    ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
-    ggml_cuda_pool_alloc<float *>       X_ptrs_alloc(ctx.pool(id), total_batches);
-
-    const float ** A_ptrs_dev = A_ptrs_alloc.get();
-    float **       X_ptrs_dev = X_ptrs_alloc.get();
-
-    get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
-                                                                        total_batches, s02, s03, s2, s3);
-
-    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
-
-    // Yes, this is necessary, without this we get RMSE errors
-    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
-    CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
-                                    CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
-
-    // revert to standard mode from common.cuh
-    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
-
-    GGML_UNUSED_VARS(s12, s13);
-}

 // ======================
 // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
@@ -137,7 +63,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
    float x_low  = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
    float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;

-    const int half      = WARP_SIZE;
+    const int half = WARP_SIZE;
    const int nrows_low = (n < half) ? n : half;

 #pragma unroll
@@ -155,8 +81,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,

 #pragma unroll
    for (int row = half; row < n; ++row) {
-        float     sum = sA[row * n + lane] * x_low;
-        const int j   = half + lane;
+        float sum = sA[row * n + lane] * x_low;
+        const int j = half + lane;
        if (j < row) {
            sum += sA[row * n + j] * x_high;
        }
@@ -171,7 +97,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
    for (int rr = 0; rr < 2; ++rr) {
        const int row = rr * WARP_SIZE + lane;
        if (row < n) {
-            const float val            = (row < half) ? x_low : x_high;
+            const float val = (row < half) ? x_low : x_high;
            X_batch[row * k + col_idx] = val;
        }
    }
@@ -250,26 +176,20 @@ static void solve_tri_f32_cuda(const float * A,
 }

 void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];  // A (n×n, lower triangular)
-    const ggml_tensor * src1 = dst->src[1];  // B (n×k)
+    const ggml_tensor * src0 = dst->src[0];  // A (triangular n x x matrix)
+    const ggml_tensor * src1 = dst->src[1];  // B (right hand side of n x k equation columns)

    ggml_is_contiguous(src0);
    ggml_is_contiguous(src1);

-    const int64_t n    = src0->ne[0];
-    const int64_t k    = src1->ne[0];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    const int64_t n = src0->ne[0];
+    const int64_t k = src1->ne[0];

-    if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
-        solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
-                           src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
-                           src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
-                           dst->nb[3] / sizeof(float), ctx.stream());
-    } else {
-        solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
-                             ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
-                             src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
-                             dst->nb[3] / sizeof(float), ctx.stream());
-    }
+    GGML_ASSERT(n <= 64);
+    GGML_ASSERT(k <= 32);
+
+    solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
+                       src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+                       src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+                       dst->nb[3] / sizeof(float), ctx.stream());
 }
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -19,9 +19,6 @@
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_16BF HIPBLAS_R_16B
 #define CUDA_R_32F  HIPBLAS_R_32F
-#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
-#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
-#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
 #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
 #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
@@ -33,7 +30,6 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define __all_sync(mask, var) __all(var)
 #define __any_sync(mask, var) __any(var)
-#define cublasStrsmBatched hipblasStrsmBatched
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -12,16 +12,11 @@
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
-#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
-#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
-#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
-#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F  MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F  MUSA_R_32F
-#define cublasStrsmBatched mublasStrsmBatched
 #define cublasComputeType_t cudaDataType_t
 #define cublasCreate mublasCreate
 #define cublasDestroy mublasDestroy
--- a/ggml/src/ggml-hexagon/htp/rope-ops.c
+++ b/ggml/src/ggml-hexagon/htp/rope-ops.c
@@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
    return (1 - MIN(1, MAX(0, y)));
 }

-static void rope_cache_init(const float    theta_base,
-                            const float    freq_scale,
-                            const float *  freq_factors,
-                            float *        corr_dims,
-                            const uint32_t ne0,
-                            const float    ext_factor,
-                            const float    mscale,
-                            float *        cache,
-                            const float    theta_scale) {
+static void rope_cache_init(const float   theta_base,
+                            float         freq_scale,
+                            const float * freq_factors,
+                            float *       corr_dims,
+                            uint32_t      ne0,
+                            float         ext_factor,
+                            float         mscale,
+                            float *       cache,
+                            float         theta_scale) {
    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
    float theta = theta_base;

@@ -92,19 +92,18 @@ static void rope_cache_init(const float    theta_base,

        // Get n-d rotational scaling corrected for extrapolation
        float theta_interp = freq_scale * theta_extrap;
-        float theta_final  = theta_interp;
-        float mscale_final = mscale;
+        float theta2       = theta_interp;

        if (ext_factor != 0.0f) {
            float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta_final    = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            theta2         = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

            // Get n-d magnitude scaling corrected for interpolation
-            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
        }

-        cache[i0 + 0] = cosf(theta_final) * mscale_final;
-        cache[i0 + 1] = sinf(theta_final) * mscale_final;
+        cache[i0 + 0] = cosf(theta2) * mscale;
+        cache[i0 + 1] = sinf(theta2) * mscale;

        theta *= theta_scale;
    }
@@ -152,9 +151,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
 }

 static void hvx_calc_rope_neox_f32(const float * restrict src0,
-                                   float * restrict dst,
-                                   const int num_elems,
-                                   const float * restrict theta_cache) {
+                              float * restrict dst,
+                              const int num_elems,
+                              const float * restrict theta_cache) {
    // for (int i = 0; i < num_elems; i += 2) {
    //const float cos_theta = theta_cache[i + 0];
    //const float sin_theta = theta_cache[i + 1];
@@ -193,7 +192,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
        HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

-        *(HVX_Vector *) dst_curr               = Q6_Vsf_equals_Vqf32(v4);
+        *(HVX_Vector *) dst_curr          = Q6_Vsf_equals_Vqf32(v4);
        *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);

        src0_curr += VLEN;
@@ -260,7 +259,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                         const uint32_t       ir1,
                         int                  nth,
                         int                  ith,
-                         const int            opt_path) {
+                         int                  opt_path) {
    struct htp_ops_context * octx = rope_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
@@ -268,8 +267,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
    const struct htp_tensor * src2 = &octx->src2;
    struct htp_tensor *       dst  = &octx->dst;

-    const int32_t mode    = rope_ctx->mode;
-    const bool    is_neox = mode & HTP_ROPE_TYPE_NEOX;
+    const int32_t mode  = rope_ctx->mode;
+    const bool is_neox  = mode & HTP_ROPE_TYPE_NEOX;

    htp_rope_preamble;

@@ -282,9 +281,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
        freq_factors = (const float *) src2->data;
    }

-    const uint32_t i1_end       = MIN(ir1, ne1);
-    const int32_t  half_dims    = rope_ctx->n_dims / 2;
-    const size_t   remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
+    int ir = 0;
+
    for (uint32_t i3 = 0; i3 < ne3; i3++) {      // batch
        for (uint32_t i2 = 0; i2 < ne2; i2++) {  // seq-len
            const int32_t p = pos[i2];
@@ -292,7 +290,14 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
            rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                            rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);

-            for (uint32_t i1 = ir0; i1 < i1_end; i1++) {  // attn-heads
+            for (uint32_t i1 = 0; i1 < ne1; i1++) {  // attn-heads
+                if (ir++ < ir0) {
+                    continue;
+                }
+                if (ir > ir1) {
+                    break;
+                }
+
                const float * src      = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                float *       dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);

@@ -305,9 +310,6 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                    } else {
                        hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                    }
-
-                    src_loc += rope_ctx->n_dims;
-                    dst_data_loc += rope_ctx->n_dims;
                } else {
                    for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                        const float cos_theta = wp0[i0 + 0];
@@ -315,10 +317,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,

                        if (is_neox) {
                            const float x0 = src_loc[0];
-                            const float x1 = src_loc[half_dims];
+                            const float x1 = src_loc[rope_ctx->n_dims/2];

-                            dst_data_loc[0]         = x0 * cos_theta - x1 * sin_theta;
-                            dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
+                            dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
+                            dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;

                            src_loc += 1;
                            dst_data_loc += 1;
@@ -333,13 +335,15 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                            dst_data_loc += 2;
                        }
                    }
-
-                    src_loc += (is_neox ? half_dims : 0);
-                    dst_data_loc += (is_neox ? half_dims : 0);
                }

-                // TODO: use simd to speed up the remaining elements copy
-                memcpy(dst_data_loc, src_loc, remain_bytes);
+                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
+                    dst_data_loc[0] = src_loc[0];
+                    dst_data_loc[1] = src_loc[1];
+
+                    src_loc += 2;
+                    dst_data_loc += 2;
+                }
            }
        }
    }
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -411,38 +411,6 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv(ggml_me
    return res;
 }

-ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched(ggml_metal_library_t lib, const ggml_tensor * op, int ssm_conv_bs) {
-    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(ggml_is_contiguous(op->src[0]));
-    GGML_ASSERT(ggml_is_contiguous(op->src[1]));
-
-    char base[256];
-    char name[256];
-
-    const char * suffix = "";
-    if (op->src[1]->ne[0] % 4 == 0) {
-        suffix = "_4";
-    }
-
-    snprintf(base, 256, "kernel_ssm_conv_%s_%s_batched%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type), suffix);
-    snprintf(name, 256, "%s_ssm_conv_bs=%d", base, ssm_conv_bs);
-
-    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
-    if (!res.pipeline) {
-        ggml_metal_cv_t cv = ggml_metal_cv_init();
-
-        ggml_metal_cv_set_int16(cv, ssm_conv_bs, FC_SSM_CONV + 0);
-
-        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
-
-        ggml_metal_cv_free(cv);
-    }
-
-    return res;
-}
-
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_metal_library_t lib, const ggml_tensor * op)  {
    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);

@@ -459,12 +427,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_me
        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
    }

-    // Shared memory layout:
-    // - sgptg * NW floats for partial sums (nsg * 32)
-    // - sgptg floats for shared_x_dt (nsg)
-    // - sgptg floats for shared_dA (nsg)
-    // Total: nsg * (32 + 2) floats
-    res.smem = (32 + 2)*sizeof(float)*nsg;
+    res.smem = 32*sizeof(float)*nsg;

    return res;
 }
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -117,7 +117,6 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_ad
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri               (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max          (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv          (ggml_metal_library_t lib, const struct ggml_tensor * op);
-struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched  (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan          (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -77,7 +77,6 @@
 #define FC_MUL_MV                      600
 #define FC_MUL_MM                      700
 #define FC_ROPE                        800
-#define FC_SSM_CONV                    900

 // op-specific constants
 #define OP_FLASH_ATTN_EXT_NQPTG 8
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -221,7 +221,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
        }

        if (ctx->debug_graph > 0) {
-            GGML_LOG_DEBUG("%s: node[%5d] - %-12s %-12s %s\n", __func__, idx, ggml_op_name(node->op), ggml_get_name(node), is_concurrent ? "(concurrent)" : "");
+            GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(node->op), is_concurrent ? "(concurrent)" : "");
        }
        if (ctx->debug_graph > 1) {
            GGML_TENSOR_LOCALS( int64_t, ne0, node->src[0], ne);
@@ -1365,43 +1365,15 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
        /*.nb2  =*/ nb2,
    };

-    // Use batched kernel for prefill (ne1 > 1) to reduce threadgroup dispatch overhead
-    const bool use_batched = (ne1 > 1);
+    auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);

-    if (use_batched) {
-        // Determine the smallest power of 2 that's >= ne1, but <= 256
-        int BATCH_SIZE;
-        if      (ne1 > 128) BATCH_SIZE = 256;
-        else if (ne1 > 64 ) BATCH_SIZE = 128;
-        else if (ne1 > 32 ) BATCH_SIZE = 64;
-        else if (ne1 > 16 ) BATCH_SIZE = 32;
-        else if (ne1 > 8  ) BATCH_SIZE = 16;
-        else if (ne1 > 4  ) BATCH_SIZE = 8;
-        else                BATCH_SIZE = 2;
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+    ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+    ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op),         3);

-        auto pipeline = ggml_metal_library_get_pipeline_ssm_conv_batched(lib, op, BATCH_SIZE);
-
-        ggml_metal_encoder_set_pipeline(enc, pipeline);
-        ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
-        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
-        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
-        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op),         3);
-
-        // Dispatch: ne01 rows, ceil(ne1/BATCH_SIZE) token batches, ne02 sequences
-        // Each threadgroup has BATCH_SIZE threads, each handling one token
-        const int n_token_batches = (ne1 + BATCH_SIZE - 1) / BATCH_SIZE;
-        ggml_metal_encoder_dispatch_threadgroups(enc, ne01, n_token_batches, ne02, BATCH_SIZE, 1, 1);
-    } else {
-        auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
-
-        ggml_metal_encoder_set_pipeline(enc, pipeline);
-        ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
-        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
-        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
-        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op),         3);
-
-        ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
-    }
+    ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);

    return 1;
 }
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2343,102 +2343,7 @@ kernel void kernel_ssm_conv_f32_f32_4(
    x[0] = sumf;
 }

-constant short FC_ssm_conv_bs   [[function_constant(FC_SSM_CONV + 0)]];
-
-// Batched version: each threadgroup processes multiple tokens for better efficiency
-// Thread layout: each thread handles one token, threadgroup covers BATCH_SIZE tokens
-kernel void kernel_ssm_conv_f32_f32_batched(
-        constant ggml_metal_kargs_ssm_conv & args,
-        device const  void * src0,
-        device const  void * src1,
-        device       float * dst,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    // tgpig.x = row index (ir)
-    // tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
-    // tgpig.z = sequence index (i3)
-    // tpitg.x = thread within batch (0..BATCH_SIZE-1)
-    const short BATCH_SIZE = FC_ssm_conv_bs;
-
-    const int64_t ir      = tgpig.x;
-    const int64_t i2_base = tgpig.y * BATCH_SIZE;
-    const int64_t i3      = tgpig.z;
-    const int64_t i2_off  = tpitg.x;
-    const int64_t i2      = i2_base + i2_off;
-
-    const int64_t nc  = args.ne10;  // conv kernel size (typically 4)
-    const int64_t n_t = args.ne1;   // number of tokens
-
-    // Bounds check for partial batches at the end
-    if (i2 >= n_t) {
-        return;
-    }
-
-    // Load conv weights (shared across all tokens for this row)
-    device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11);
-
-    // Load source for this specific token
-    device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
-
-    // Output location for this token
-    device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
-
-    float sumf = 0.0f;
-    for (int64_t i0 = 0; i0 < nc; ++i0) {
-        sumf += s[i0] * c[i0];
-    }
-
-    x[0] = sumf;
-}
-
-kernel void kernel_ssm_conv_f32_f32_batched_4(
-        constant ggml_metal_kargs_ssm_conv & args,
-        device const  void * src0,
-        device const  void * src1,
-        device       float * dst,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    // tgpig.x = row index (ir)
-    // tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
-    // tgpig.z = sequence index (i3)
-    // tpitg.x = thread within batch (0..BATCH_SIZE-1)
-    const short BATCH_SIZE = FC_ssm_conv_bs;
-
-    const int64_t ir      = tgpig.x;
-    const int64_t i2_base = tgpig.y * BATCH_SIZE;
-    const int64_t i3      = tgpig.z;
-    const int64_t i2_off  = tpitg.x;
-    const int64_t i2      = i2_base + i2_off;
-
-    const int64_t nc  = args.ne10;  // conv kernel size (typically 4)
-    const int64_t n_t = args.ne1;   // number of tokens
-
-    // Bounds check for partial batches at the end
-    if (i2 >= n_t) {
-        return;
-    }
-
-    // Load conv weights (shared across all tokens for this row)
-    device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11);
-
-    // Load source for this specific token
-    device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
-
-    // Output location for this token
-    device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
-
-    float sumf = 0.0f;
-    for (int64_t i0 = 0; i0 < nc/4; ++i0) {
-        sumf += dot(s[i0], c[i0]);
-    }
-
-    x[0] = sumf;
-}
-
 // ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
-// Optimized version: reduces redundant memory loads by having one thread load shared values
 kernel void kernel_ssm_scan_f32(
        constant ggml_metal_kargs_ssm_scan & args,
        device const void * src0,
@@ -2458,15 +2363,7 @@ kernel void kernel_ssm_scan_f32(
        uint3    tgpg[[threadgroups_per_grid]]) {
    constexpr short NW = N_SIMDWIDTH;

-    // Shared memory layout:
-    // [0..sgptg*NW-1]: partial sums for reduction (existing)
-    // [sgptg*NW..sgptg*NW+sgptg-1]: pre-computed x_dt values for each token in batch
-    // [sgptg*NW+sgptg..sgptg*NW+2*sgptg-1]: pre-computed dA values for each token in batch
-    threadgroup float * shared_sums = shared;
-    threadgroup float * shared_x_dt = shared + sgptg * NW;
-    threadgroup float * shared_dA   = shared + sgptg * NW + sgptg;
-
-    shared_sums[tpitg.x] = 0.0f;
+    shared[tpitg.x] = 0.0f;

    const int32_t i0 = tpitg.x;
    const int32_t i1 = tgpig.x;
@@ -2506,47 +2403,32 @@ kernel void kernel_ssm_scan_f32(
    for (int i2 = 0; i2 < n_t; i2 += sgptg) {
        threadgroup_barrier(mem_flags::mem_threadgroup);

-        // Pre-compute x_dt and dA for this batch of tokens
-        // Only first sgptg threads do the loads and expensive math
-        if (i0 < sgptg && i2 + i0 < n_t) {
-            // ns12 and ns21 are element strides (nb12/nb10, nb21/nb20)
-            device const float * x_t  = x  + i0 * args.ns12;
-            device const float * dt_t = dt + i0 * args.ns21;
-
-            const float dt0  = dt_t[0];
-            const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0;
-            shared_x_dt[i0] = x_t[0] * dtsp;
-            shared_dA[i0]   = dtsp;  // Store dtsp, compute exp(dtsp * A0) per-thread since A0 varies
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
        for (int t = 0; t < sgptg && i2 + t < n_t; t++) {
-            const float x_dt = shared_x_dt[t];
-            const float dA   = exp(shared_dA[t] * A0);
+            const float dt0  = dt[0];
+            const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0;
+            const float x_dt = x[0] * dtsp;
+            const float dA   = exp(dtsp * A0);

            s = (s0 * dA) + (B[i0] * x_dt);

            const float sumf = simd_sum(s * C[i0]);

            if (tiisg == 0) {
-                shared_sums[t*NW + sgitg] = sumf;
+                shared[t*NW + sgitg] = sumf;
            }

            // recurse
            s0 = s;

+            x  += args.ns12;
+            dt += args.ns21;
            B  += args.ns42;
            C  += args.ns52;
        }

-        // Advance pointers for next batch
-        x  += sgptg * args.ns12;
-        dt += sgptg * args.ns21;
-
        threadgroup_barrier(mem_flags::mem_threadgroup);

-        const float sumf = simd_sum(shared_sums[sgitg*NW + tiisg]);
+        const float sumf = simd_sum(shared[sgitg*NW + tiisg]);

        if (tiisg == 0 && i2 + sgitg < n_t) {
            y[sgitg*nh*nr] = sumf;
--- a/ggml/src/ggml-sycl/softmax.cpp
+++ b/ggml/src/ggml-sycl/softmax.cpp
@@ -277,7 +277,7 @@ static void soft_max_f32_sycl(const float *x, const T *mask,
    const int id       = get_current_device_id();
    const size_t smpbo = ggml_sycl_info().devices[id].smpbo;

-    if (nbytes_shared <= smpbo && ncols_x <= max_block_size) {
+    if (nbytes_shared <= smpbo) {
        launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(
            x, mask, sinks, dst, params, stream, block_dims, block_nums,
            nbytes_shared);
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -124,13 +124,6 @@ static void ggml_print_backtrace_symbols(void) {
    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
 }
-#elif defined(__APPLE__)
-#include <execinfo.h>
-static void ggml_print_backtrace_symbols(void) {
-    void * trace[100];
-    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
-    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
-}
 #else
 static void ggml_print_backtrace_symbols(void) {
    // platform not supported
@@ -142,20 +135,6 @@ void ggml_print_backtrace(void) {
    if (GGML_NO_BACKTRACE) {
        return;
    }
-#if defined(__APPLE__)
-    // On macOS, fork+debugger attachment is problematic due to:
-    // 1. libdispatch "poisons" forked child processes
-    // 2. lldb has issues attaching to parent from forked child
-    // Use simple backtrace() instead to avoid Terminal.app crashes
-    const char * GGML_BACKTRACE_LLDB = getenv("GGML_BACKTRACE_LLDB");
-    if (!GGML_BACKTRACE_LLDB) {
-        fprintf(stderr, "WARNING: Using native backtrace. Set GGML_BACKTRACE_LLDB for more info.\n");
-        fprintf(stderr, "WARNING: GGML_BACKTRACE_LLDB may cause native MacOS Terminal.app to crash.\n");
-        fprintf(stderr, "See: https://github.com/ggml-org/llama.cpp/pull/17869\n");
-        ggml_print_backtrace_symbols();
-        return;
-    }
-#endif
 #if defined(__linux__)
    FILE * f = fopen("/proc/self/status", "r");
    size_t size = 0;
@@ -3439,7 +3418,6 @@ struct ggml_tensor * ggml_cast(

    result->op     = GGML_OP_CPY;
    result->src[0] = a;
-    result->src[1] = result;

    return result;
 }
@@ -5260,6 +5238,8 @@ struct ggml_tensor * ggml_flash_attn_ext(

    if (mask) {
        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
+                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));

        GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
--- a/models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja
+++ b/models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja
@@ -1,126 +0,0 @@
-{#- Default system message if no system prompt is passed. #}
-{%- set default_system_message = '# HOW YOU SHOULD THINK AND ANSWER\n\nFirst draft your thinking process (inner monologue) until you arrive at a response. Format your response using Markdown, and use LaTeX for any mathematical equations. Write both your thoughts and the response in the same language as the input.\n\nYour thinking process must follow the template below:[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. Be as casual and as long as you want until you are confident to generate the response to the user.[/THINK]Here, provide a self-contained response.' %}
-
-{#- Begin of sequence token. #}
-{{- bos_token }}
-
-{#- Handle system prompt if it exists. #}
-{#- System prompt supports text content or text and thinking chunks. #}
-{%- if messages[0]['role'] == 'system' %}
-    {{- '[SYSTEM_PROMPT]' -}}
-    {%- if messages[0]['content'] is string %}
-        {{- messages[0]['content'] -}}
-    {%- else %}        
-        {%- for block in messages[0]['content'] %}
-            {%- if block['type'] == 'text' %}
-                {{- block['text'] }}
-            {%- elif block['type'] == 'thinking' %}
-                {{- '[THINK]' + block['thinking'] + '[/THINK]' }}
-            {%- else %}
-                {{- raise_exception('Only text and thinking chunks are supported in system message contents.') }}
-            {%- endif %}
-        {%- endfor %}
-    {%- endif %}
-    {{- '[/SYSTEM_PROMPT]' -}}
-    {%- set loop_messages = messages[1:] %}
-{%- else %}
-    {%- set loop_messages = messages %}
-    {%- if default_system_message != '' %}
-        {{- '[SYSTEM_PROMPT]' + default_system_message + '[/SYSTEM_PROMPT]' }}
-    {%- endif %}
-{%- endif %}
-
-
-{#- Tools definition #}
-{%- set tools_definition = '' %}
-{%- set has_tools = false %}
-{%- if tools is defined and tools is not none and tools|length > 0 %}
-    {%- set has_tools = true %}
-    {%- set tools_definition = '[AVAILABLE_TOOLS]' + (tools| tojson) + '[/AVAILABLE_TOOLS]' %}
-    {{- tools_definition }}
-{%- endif %}
-
-{#- Checks for alternating user/assistant messages. #}
-{%- set ns = namespace(index=0) %}
-{%- for message in loop_messages %}
-    {%- if message.role == 'user' or (message.role == 'assistant' and (message.tool_calls is not defined or message.tool_calls is none or message.tool_calls | length == 0)) %}
-        {%- if (message['role'] == 'user') != (ns.index % 2 == 0) %}
-            {{- raise_exception('After the optional system message, conversation roles must alternate user and assistant roles except for tool calls and results.') }}
-        {%- endif %}
-        {%- set ns.index = ns.index + 1 %}
-    {%- endif %}
-{%- endfor %}
-
-{#- Handle conversation messages. #}
-{%- for message in loop_messages %}
-
-    {#- User messages supports text content or text and image chunks. #}
-    {%- if message['role'] == 'user' %}
-        {%- if message['content'] is string %}
-            {{- '[INST]' + message['content'] + '[/INST]' }}
-        {%- elif message['content'] | length > 0 %}
-            {{- '[INST]' }}
-            {%- if message['content'] | length == 2 %}
-                {%- set blocks = message['content'] | sort(attribute='type') %}
-            {%- else %}
-                {%- set blocks = message['content'] %}
-            {%- endif %}
-            {%- for block in blocks %}
-                {%- if block['type'] == 'text' %}
-                    {{- block['text'] }}
-                {%- elif block['type'] in ['image', 'image_url'] %}
-                    {{- '[IMG]' }}
-                {%- else %}
-                    {{- raise_exception('Only text, image and image_url chunks are supported in user message content.') }}
-                {%- endif %}
-            {%- endfor %}
-            {{- '[/INST]' }}
-        {%- else %}
-            {{- raise_exception('User message must have a string or a list of chunks in content') }}
-        {%- endif %}
-
-    {#- Assistant messages supports text content or text, image and thinking chunks. #}
-    {%- elif message['role'] == 'assistant' %}
-        {%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}
-            {{- raise_exception('Assistant message must have a string or a list of chunks in content or a list of tool calls.') }}
-        {%- endif %}
-
-        {%- if message['content'] is string and message['content'] != '' %}
-            {{- message['content'] }}
-        {%- elif message['content'] | length > 0 %}
-            {%- for block in message['content'] %}
-                {%- if block['type'] == 'text' %}
-                    {{- block['text'] }}
-                {%- elif block['type'] == 'thinking' %}
-                    {{- '[THINK]' + block['thinking'] + '[/THINK]' }}
-                {%- else %}
-                    {{- raise_exception('Only text and thinking chunks are supported in assistant message contents.') }}
-                {%- endif %}
-            {%- endfor %}
-        {%- endif %}
-        
-        {%- if message['tool_calls'] is defined and message['tool_calls'] is not none and message['tool_calls']|length > 0 %}
-            {%- for tool in message['tool_calls'] %}
-                {{- '[TOOL_CALLS]' }}
-                {%- set name = tool['function']['name'] %}
-                {%- set arguments = tool['function']['arguments'] %}
-                {%- if arguments is not string %}
-                    {%- set arguments = arguments|tojson|safe %}
-                {%- elif arguments == '' %}
-                    {%- set arguments = '{}' %}
-                {%- endif %}
-                {{- name + '[ARGS]' + arguments }}
-            {%- endfor %}
-        {%- endif %}
-
-        {{- eos_token }}
-
-    {#- Tool messages only supports text content. #}
-    {%- elif message['role'] == 'tool' %}
-        {{- '[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]' }}
-
-    {#- Raise exception for unsupported roles. #}
-    {%- else %}
-        {{- raise_exception('Only user, assistant and tool roles are supported, got ' + message['role'] + '.') }}
-    {%- endif %}
-{%- endfor %}
--- a/scripts/snapdragon/adb/run-cli.sh
+++ b/scripts/snapdragon/adb/run-cli.sh
@@ -46,7 +46,7 @@ adb $adbserial shell " \
    LD_LIBRARY_PATH=$basedir/$branch/lib   \
    ADSP_LIBRARY_PATH=$basedir/$branch/lib \
    $verbose $experimental $sched $opmask $profile $nhvx $ndev       \
-      ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model   \
+      ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model   \
         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1             \
         --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
         -ngl 99 --device $device $cli_opts $@ \
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -139,7 +139,6 @@ add_library(llama
 set_target_properties(llama PROPERTIES
    VERSION ${LLAMA_INSTALL_VERSION}
    SOVERSION 0
-    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
 )

 target_include_directories(llama PRIVATE .)
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -695,8 +695,6 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
    udata->output    .resize(n_tokens);

-    udata->seq_id_data.reserve(n_tokens);
-
    seq_set_t seq_set_unq;

    for (size_t i = 0; i < idxs.size(); ++i) {
@@ -718,13 +716,11 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }

        udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
+        udata->seq_id[i]   = batch.seq_id[idxs[i]];
        udata->output[i]   = batch.logits[idxs[i]];

        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
-
-            udata->seq_id_data.push_back(seq_id);
-            seq_set_unq.set(seq_id);
+            seq_set_unq.set(udata->seq_id[i][s]);
        }

        if (udata->output[i]) {
@@ -732,12 +728,6 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }
    }

-    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
-    for (size_t i = 0; i < idxs.size(); ++i) {
-        udata->seq_id[i] = seq_id_ptr;
-        seq_id_ptr += udata->n_seq_id[i];
-    }
-
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
            udata->seq_idx[s] = udata->seq_id_unq.size();
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -56,15 +56,13 @@ struct llama_ubatch {
        std::vector<float>          embd;
        std::vector<llama_pos>      pos;
        std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;      // these point into the seq_id_data below
+        std::vector<llama_seq_id *> seq_id;
        std::vector<llama_seq_id>   seq_id_unq;
        std::vector<int32_t>        seq_idx;
        std::vector<int8_t>         output;
-
-        std::vector<llama_seq_id> seq_id_data;
    };

-    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
+    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
    std::shared_ptr<data_t> data;
 };

--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -93,6 +93,14 @@ llama_context::llama_context(
    // with causal attention, the batch size is limited by the context size
    cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

    cparams.op_offload = params.op_offload;
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -385,7 +385,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
  //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

    res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    return res;
 }
@@ -416,10 +416,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
  //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

    res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
+    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    return res;
 }
@@ -452,7 +452,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
            }
        }

-        for (int i = n_tokens; i < n_tokens; ++i) {
+        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
            for (int j = 0; j < n_enc; ++j) {
                data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
            }
@@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    freq_base        (cparams.rope_freq_base),
    freq_scale       (cparams.rope_freq_scale),
    ext_factor       (cparams.yarn_ext_factor),
-    attn_factor      (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)),
+    attn_factor      (cparams.yarn_attn_factor),
    beta_fast        (cparams.yarn_beta_fast),
    beta_slow        (cparams.yarn_beta_slow),
    norm_eps         (hparams.f_norm_eps),
@@ -1470,13 +1470,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
    auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

    // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
    ggml_set_input(inp->self_kq_mask);

    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
        ggml_set_input(inp->self_kq_mask_swa);

        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1558,7 +1558,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
        inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
        inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
        ggml_set_input(inp->self_kq_mask);

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1701,7 +1701,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {

    const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
    ggml_set_input(inp->cross_kq_mask);

    inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1767,7 +1767,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
        inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
        inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
        ggml_set_input(inp->self_kq_mask);

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1781,7 +1781,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
        inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
        inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
        ggml_set_input(inp->self_kq_mask_swa);

        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -1,9 +1,7 @@
 #include "llama-hparams.h"

 #include "ggml.h"
-
 #include <cassert>
-#include <cmath>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
    if (dense_first) {
@@ -231,13 +229,3 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama

    return false;
 }
-
-float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
-    GGML_ASSERT(ext_factor >= 0.0f);
-
-    if (ext_factor != 0.0f) {
-        attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
-    }
-
-    return attn_factor;
-}
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -107,7 +107,6 @@ struct llama_hparams {
    float    rope_freq_base_train_swa;
    float    rope_freq_scale_train;
    float    rope_freq_scale_train_swa;
-
    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;

@@ -268,13 +267,7 @@ struct llama_hparams {
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
-
-    // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
-    // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
-    //
-    // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
-    //      https://github.com/ggml-org/llama.cpp/pull/17945
-    static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
+
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1232,7 +1232,8 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
    GGML_ASSERT(n_tokens%n_stream == 0);

    // n_tps == n_tokens_per_stream
-    const int64_t n_tps = n_tokens/n_stream;
+    const int64_t n_tps     = n_tokens/n_stream;
+    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);

    std::fill(data, data + ggml_nelements(dst), -INFINITY);

@@ -1265,7 +1266,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
                const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;

-                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
+                const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);

                for (uint32_t j = 0; j < n_kv; ++j) {
                    if (cells.is_empty(j)) {
@@ -1369,10 +1370,9 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
                      float   freq_scale) const {
    const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

-    const auto & yarn_ext_factor  = cparams.yarn_ext_factor;
-    const auto & yarn_beta_fast   = cparams.yarn_beta_fast;
-    const auto & yarn_beta_slow   = cparams.yarn_beta_slow;
-    const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);
+    const auto & yarn_ext_factor = cparams.yarn_ext_factor;
+    const auto & yarn_beta_fast  = cparams.yarn_beta_fast;
+    const auto & yarn_beta_slow  = cparams.yarn_beta_slow;

    const auto & n_rot     = hparams.n_rot;
    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@@ -1383,6 +1383,12 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
                                ? LLAMA_ROPE_TYPE_NEOX
                                : hparams.rope_type;

+    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
+    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
+                                    ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
+                                    : cparams.yarn_attn_factor;
+
    ggml_tensor * tmp;

    if (ggml_is_quantized(cur->type)) {
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -120,7 +120,6 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_16B_A1B:       return "16B.A1B";
        case LLM_TYPE_21B_A3B:       return "21B.A3B";
        case LLM_TYPE_30B_A3B:       return "30B.A3B";
-        case LLM_TYPE_80B_A3B:       return "80B.A3B";
        case LLM_TYPE_100B_A6B:      return "100B.A6B";
        case LLM_TYPE_106B_A12B:     return "106B.A12B";
        case LLM_TYPE_230B_A10B:     return "230B.A10B";
@@ -1607,9 +1606,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);

-                switch (hparams.n_ff_exp) {
-                    case 1408: type = LLM_TYPE_16B; break;
-                    case 1792: type = LLM_TYPE_20B; break;
+                switch (hparams.n_layer) {
+                    case 28: type = LLM_TYPE_20B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
@@ -1635,12 +1633,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
-
-                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
-                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-                    // cancel the factor from the convert script
-                    hparams.rope_yarn_log_mul /= 0.1f;
-                }
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

                // (optional) temperature tuning - used by mistral-large
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
@@ -2263,7 +2256,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                }

                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_80B_A3B; break;
+                    case 80: type = LLM_TYPE_80B_A3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
@@ -2272,9 +2265,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, 0.0f);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,     hparams.rope_yarn_log_mul, false);

                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2284,6 +2277,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    }
                }

+                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
+                //       but may need further verification with other values
+                if (hparams.rope_yarn_log_mul != 0.0f) {
+                    float factor = 1.0f / hparams.rope_freq_scale_train;
+                    float mscale = 1.0f;
+                    float mscale_all_dims = hparams.rope_yarn_log_mul;
+                    static auto get_mscale = [](float scale, float mscale) {
+                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+                    };
+                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+                }
+
                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_3B; break;
                    case 34: type = LLM_TYPE_8B; break;
@@ -2294,32 +2299,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        default: throw std::runtime_error("unsupported model architecture");
    }

-    // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
-    if (hparams.rope_yarn_log_mul != 0.0f) {
-        const float factor = 1.0f / hparams.rope_freq_scale_train;
-
-        // note: here we assume `mscale == 1.0f`
-        // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
-              float mscale          = 1.0f;
-        const float mscale_all_dims = hparams.rope_yarn_log_mul;
-
-        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-        // special-case DEEPSEEK v2:
-        // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
-        if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
-            mscale = mscale_all_dims;
-        }
-
-        static auto get_mscale = [](float scale, float mscale) {
-            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-        };
-
-        hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-
-        LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
-                __func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
-    }
-
    pimpl->n_bytes = ml.n_bytes;

    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
@@ -6825,7 +6804,6 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n",   __func__, hparams.rope_yarn_log_mul);
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6889,6 +6867,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -1,5 +1,7 @@
 #include "models.h"

+
+
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -18,15 +20,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-
-    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
-    GGML_ASSERT(ext_factor >= 0.0f);
-    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
-
-    // use the original attn_factor to pre-scale the kq_scale
-    const float mscale   = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
-    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+    const float mscale      = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale    = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -20,20 +20,20 @@ int main(void) {
            std::unordered_set<std::string> seen_env_vars;
            for (const auto & opt : ctx_arg.options) {
                // check for args duplications
-                for (const auto & arg : opt.get_args()) {
+                for (const auto & arg : opt.args) {
                    if (seen_args.find(arg) == seen_args.end()) {
                        seen_args.insert(arg);
                    } else {
-                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str());
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
                        exit(1);
                    }
                }
                // check for env var duplications
-                for (const auto & env : opt.get_env()) {
-                    if (seen_env_vars.find(env) == seen_env_vars.end()) {
-                        seen_env_vars.insert(env);
+                if (opt.env) {
+                    if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
+                        seen_env_vars.insert(opt.env);
                    } else {
-                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str());
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
                        exit(1);
                    }
                }
@@ -115,14 +115,6 @@ int main(void) {
    assert(params.model.path == "blah.gguf");
    assert(params.cpuparams.n_threads == 1010);

-    printf("test-arg-parser: test negated environment variables\n\n");
-
-    setenv("LLAMA_ARG_MMAP", "0", true);
-    setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
-    argv = {"binary_name"};
-    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.use_mmap == false);
-    assert(params.no_perf == true);

    printf("test-arg-parser: test environment variables being overwritten\n\n");

--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5875,7 +5875,7 @@ struct test_flash_attn_ext : public test_case {

        ggml_tensor * m = nullptr;
        if (mask) {
-            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, nb, 1, nr23[1]);
+            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, nr23[1]);
            ggml_set_name(m, "m");
        }

@@ -6253,31 +6253,6 @@ struct test_solve_tri : public test_case {
    }
 };

-// GGML_OP_DIAG
-struct test_diag : public test_case {
-    const ggml_type              type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override { return VARS_TO_STR2(type, ne); }
-
-    test_diag(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = { 10, 1, 4, 3 })
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        GGML_ASSERT(ne[1] == 1);
-        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
-        ggml_set_param(a);
-        ggml_set_name(a, "a");
-
-        ggml_tensor * out = ggml_diag(ctx, a);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-};
-
-
 enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
@@ -7851,34 +7826,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_fill(-152.0f, GGML_TYPE_F32, { 800, 600, 4, 4 }));
    test_cases.emplace_back(new test_fill(3.5f, GGML_TYPE_F32, { 2048, 512, 2, 2 }));

-    test_cases.emplace_back(new test_diag());
-    test_cases.emplace_back(new test_diag(GGML_TYPE_F32, { 79, 1, 19, 13 }));
-    test_cases.emplace_back(new test_diag(GGML_TYPE_F32, { 256, 1, 8, 16 }));
-
    test_cases.emplace_back(new test_solve_tri());
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 11, 11, 1, 1 }, { 5, 11, 1, 1 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 17, 17, 2, 4 }, { 9, 17, 2, 4 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));

    for (bool v : {false, true}) {
        for (bool circular : {false, true}) {
@@ -8079,13 +8035,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8,  1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8,  1}, {4, 1}, {0, 1, 2, 3}, 2*16416));

-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
    // qwen3next with CHUNK_SIZE 64
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
    // qwen3next with CHUNK_SIZE 128
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));

    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
@@ -8209,13 +8164,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
        }
    }

-    // Examples from granite-4.0-h-1b/ggml-model-Q8_0.gguf
-    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1})); // prefill
-    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4,   3328, 1, 1}, {4, 3328, 1, 1})); // generate
-    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 512, 1)); // prefill
-    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 1,   1)); // generate
-
-
    return test_cases;
 }

--- a/tests/test-barrier.cpp
+++ b/tests/test-barrier.cpp
@@ -11,7 +11,19 @@

 #define MAX_NARGS 2

-static void test_barrier(int n_threads, int n_rounds) {
+int main(int argc, char *argv[]) {
+
+    int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
+    int n_rounds  = 100;
+
+    if (argc > 1) {
+        n_threads = std::atoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        n_rounds  = std::atoi(argv[2]);
+    }
+
    struct ggml_init_params params = {
        /* .mem_size   = */ 1024*1024*1024,
        /* .mem_buffer = */ NULL,
@@ -44,7 +56,7 @@ static void test_barrier(int n_threads, int n_rounds) {
        exit(1);
    }

-    // The test runs with constant number of threads
+    // Create compute plan
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);

    std::vector<uint8_t> work_data(cplan.work_size);
@@ -77,160 +89,6 @@ static void test_barrier(int n_threads, int n_rounds) {

    ggml_threadpool_free(threadpool);
    ggml_free(ctx);
-}
-
-static void test_active(int n_threads, int n_rounds) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 1024*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    struct ggml_context * ctx = ggml_init(params);
-
-    // Create graph
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-
-    // Small graph with, parallel ops with barriers
-    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
-    for (int i = 0; i < 2; i++) {
-        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
-        out = ggml_mul_mat(ctx, a, out);
-
-        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
-        out = ggml_mul_mat(ctx, d, out);
-    }
-
-    ggml_build_forward_expand(gf, out);
-    int n_nodes = ggml_graph_n_nodes(gf);
-
-    // Create threadpool
-    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
-    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
-    if (!threadpool) {
-        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
-        exit(1);
-    }
-
-    std::cerr << "graph-compute with"
-              << "\n n_threads: " << n_threads
-              << "\n   n_nodes: " << n_nodes
-              << "\n  n_rounds: " << n_rounds
-              << "\n";
-    // ggml_graph_print(gf);
-
-    // In this test we keep changing the number of threads every 4th iteration
-    // to test for race conditions in that path
-
-    for (int i=0; i < n_rounds; i++) {
-        struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
-
-        std::vector<uint8_t> work_data(cplan.work_size);
-        cplan.work_data = work_data.data();
-
-        ggml_graph_compute(gf, &cplan);
-    }
-
-    ggml_threadpool_free(threadpool);
-    ggml_free(ctx);
-}
-
-static void test_multi_graph(int n_threads, int n_rounds) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 1024*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    struct ggml_context * ctx = ggml_init(params);
-
-    // Create graphs
-    struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
-    {
-        // Small graph with parallel ops with barriers
-        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
-        for (int i = 0; i < 2; i++) {
-            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
-            out = ggml_mul_mat(ctx, a, out);
-
-            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
-            out = ggml_mul_mat(ctx, d, out);
-        }
-
-        ggml_build_forward_expand(gf0, out);
-    }
-
-    struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
-    {
-        // Small graph with parallel ops with barriers
-        // Use larger tensors to make sure work_data size is larger than gf0
-        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  256);
-        for (int i = 0; i < 4; i++) {
-            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
-            out = ggml_mul_mat(ctx, a, out);
-
-            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
-            out = ggml_mul_mat(ctx, d, out);
-        }
-
-        ggml_build_forward_expand(gf1, out);
-    }
-
-
-    // Create threadpool
-    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
-    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
-    if (!threadpool) {
-        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
-        exit(1);
-    }
-
-    std::cerr << "graph-compute with"
-              << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
-              << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
-              << "\n   n_threads: " << n_threads
-              << "\n    n_rounds: " << n_rounds
-              << "\n";
-
-    // In this test we keep changing the number of threads every 4th iteration
-    // and we compute two graphs back to back to test graph frequent graph switching
-
-    for (int i=0; i < n_rounds; i++) {
-        struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
-        std::vector<uint8_t> work_data0(cplan0.work_size);
-        cplan0.work_data = work_data0.data();
-
-        struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
-        std::vector<uint8_t> work_data1(cplan1.work_size);
-        cplan1.work_data = work_data1.data();
-
-        ggml_graph_compute(gf0, &cplan0);
-        ggml_graph_compute(gf1, &cplan1);
-    }
-
-    ggml_threadpool_free(threadpool);
-    ggml_free(ctx);
-}
-
-
-int main(int argc, char *argv[]) {
-
-    int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
-    int n_rounds  = 100;
-
-    if (argc > 1) {
-        n_threads = std::atoi(argv[1]);
-    }
-
-    if (argc > 2) {
-        n_rounds  = std::atoi(argv[2]);
-    }
-
-    test_barrier(n_threads, n_rounds);
-
-    test_active(n_threads,  n_rounds * 100);
-
-    test_multi_graph(n_threads,  n_rounds * 10);

    return 0;
 }
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -539,71 +539,6 @@ const common_chat_msg message_assist_call_python_lines           = simple_assist
 const common_chat_msg message_assist_call_python_lines_unclosed  = simple_assist_msg("", "", "python", "{\"code\":\"# This is a program:\\nprint('hey')");
 const common_chat_msg message_assist_call_code_interpreter       = simple_assist_msg("", "", "code_interpreter", "{\"code\":\"print('hey')\"}");

-// Use for PEG parser implementations
-struct peg_test_case {
-    common_chat_templates_inputs params;
-    std::string input;
-    common_chat_msg expect;
-};
-
-struct make_peg_parser {
-    common_chat_params params_;
-    common_peg_arena arena_;
-
-    make_peg_parser(common_chat_templates * tmpls, const common_chat_templates_inputs & inputs) {
-        params_ = common_chat_templates_apply(tmpls, inputs);
-        arena_.load(params_.parser);
-    }
-
-    common_chat_msg parse(const std::string & msg, bool is_partial) {
-        return common_chat_peg_parse(arena_, msg, is_partial, /* syntax = */ {params_.format});
-    }
-};
-
-static void test_peg_parser(common_chat_templates * tmpls, const std::function<void(peg_test_case &)> & init) {
-    peg_test_case tc;
-    init(tc);
-    if (tc.params.messages.empty()) {
-        tc.params.messages = {message_user};
-    }
-    if (tc.expect.role.empty()) {
-        tc.expect.role = "assistant";
-    }
-
-    auto parser = make_peg_parser(tmpls, tc.params);
-
-    common_chat_msg msg_accum;
-    common_chat_msg msg_prev;
-    msg_accum.role = msg_prev.role = "assistant";
-
-    for (size_t i = 1; i <= tc.input.size(); ++i) {
-        auto is_partial = i < tc.input.size();
-        common_chat_msg msg_current = parser.parse(tc.input.substr(0, i), is_partial);
-
-        for (const auto & diff : common_chat_msg_diff::compute_diffs(msg_prev, msg_current)) {
-            if (!diff.reasoning_content_delta.empty()) {
-                msg_accum.reasoning_content += diff.reasoning_content_delta;
-            }
-            if (!diff.content_delta.empty()) {
-                msg_accum.content += diff.content_delta;
-            }
-            if (diff.tool_call_index != std::string::npos) {
-                if (!diff.tool_call_delta.name.empty()) {
-                    msg_accum.tool_calls.push_back({diff.tool_call_delta.name, "", ""});
-                }
-                if (!diff.tool_call_delta.arguments.empty()) {
-                    msg_accum.tool_calls.back().arguments += diff.tool_call_delta.arguments;
-                }
-            }
-        }
-        assert_msg_equals(msg_current, msg_accum, true);
-        msg_prev = msg_current;
-    }
-
-    assert_msg_equals(tc.expect, parser.parse(tc.input, false), true);
-    assert_msg_equals(tc.expect, msg_accum, true);
-}
-
 static void test_msgs_oaicompat_json_conversion() {
    printf("[%s]\n", __func__);
    std::vector<common_chat_msg> msgs{
@@ -3499,95 +3434,7 @@ Hey there!<|im_end|>
        auto grammar = build_grammar(params.grammar);
        GGML_ASSERT(grammar && "Failed to build Qwen3-Coder grammar with union types");
    }
-}

-static void test_template_output_peg_parsers() {
-    printf("[%s]\n", __func__);
-
-    // JSON schemas
-    const char * invoice_schema = R"({
-        "type": "object",
-        "properties": {
-            "amount": {"type": "number"},
-            "date": {"type": "string"}
-        }
-    })";
-
-    {
-        // Ministral-3-14B-Reasoning-2512
-        auto tmpls = read_templates("models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja");
-
-        // Test basic message
-        test_peg_parser(tmpls.get(), [&](auto & t) {
-            t.input = "Hello, world!\nWhat's up?";
-            t.expect = message_assist;
-        });
-
-        // Test basic message and reasoning with reasoning_format = none
-        test_peg_parser(tmpls.get(), [&](auto & t) {
-            t.input = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?";
-            t.expect.content = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?";
-        });
-
-        // Test basic message and reasoning with reasoning_format = auto
-        test_peg_parser(tmpls.get(), [&](auto & t) {
-            t.input = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?";
-            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
-
-            t.expect = message_assist_thoughts;
-        });
-
-        // Test tool call
-        test_peg_parser(tmpls.get(), [&](auto & t) {
-            t.input = R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})";
-            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
-            t.params.tools = {special_function_tool};
-
-            t.expect = message_assist_call;
-        });
-
-        // Test tool call with reasoning
-        test_peg_parser(tmpls.get(), [&](auto & t) {
-            t.input = "[THINK]I'm\nthinking[/THINK]"
-                      R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})";
-            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
-            t.params.tools = {special_function_tool};
-
-            t.expect = message_assist_call_thoughts;
-        });
-
-        // Test parallel tool calls
-        test_peg_parser(tmpls.get(), [&](auto & t) {
-            t.input = R"([TOOL_CALLS]special_function[ARGS]{"arg1": 1})"
-                      R"([TOOL_CALLS]special_function_with_opt[ARGS]{"arg1": 1, "arg2": 2})";
-            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
-            t.params.parallel_tool_calls = true;
-            t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
-
-            t.expect.tool_calls = {{
-                /* .name = */      "special_function",
-                /* .arguments = */ R"({"arg1": 1})",
-                /* .id = */        {},
-            }, {
-                /* .name = */      "special_function_with_opt",
-                /* .arguments = */ R"({"arg1": 1, "arg2": 2})",
-                /* .id = */        {},
-            }};
-        });
-
-        // Test response format
-        test_peg_parser(tmpls.get(), [&](auto & t) {
-            t.input = "[THINK]I need to output the invoice details in JSON[/THINK]"
-                      "```json\n"
-                      R"({"amount": 123.45, "date": "2025-12-03"})"
-                      "\n```";
-            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
-            t.params.json_schema = invoice_schema;
-
-            t.expect.reasoning_content = "I need to output the invoice details in JSON";
-            t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
-        });
-    }
 }

 static void test_msg_diffs_compute() {
@@ -3713,7 +3560,6 @@ int main(int argc, char ** argv) {
            test_msgs_oaicompat_json_conversion();
            test_tools_oaicompat_json_conversion();
            test_template_output_parsers();
-            test_template_output_peg_parsers();
            std::cout << "\n[chat] All tests passed!" << '\n';
        }
        return 0;
--- a/tests/test-lora-conversion-inference.sh
+++ b/tests/test-lora-conversion-inference.sh
@@ -79,19 +79,19 @@ run_conversion_and_inference_lora() {

    # Run inference
    echo -e "\n\n---------------------------\n\n"
-    echo "Running llama-completion without lora for $model_name with hidden_size $hidden_size..."
-    OUTPUT_BASE=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+    echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_BASE=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
        -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0)

    echo -e "\n\n---------------------------\n\n"
-    echo "Running llama-completion with hot lora for $model_name with hidden_size $hidden_size..."
-    OUTPUT_LORA_HOT=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+    echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_LORA_HOT=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
        --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \
        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)

    echo -e "\n\n---------------------------\n\n"
-    echo "Running llama-completion with merged lora for $model_name with hidden_size $hidden_size..."
-    OUTPUT_LORA_MERGED=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
+    echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_LORA_MERGED=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)

    # Remove any initial white space
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -18,8 +18,7 @@ else()
    add_subdirectory(gguf-split)
    add_subdirectory(imatrix)
    add_subdirectory(llama-bench)
-    add_subdirectory(cli)
-    add_subdirectory(completion)
+    add_subdirectory(main)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    if (LLAMA_BUILD_SERVER)
--- a/tools/cli/CMakeLists.txt
+++ b/tools/cli/CMakeLists.txt
@@ -1,10 +0,0 @@
-set(TARGET llama-cli)
-add_executable(${TARGET} cli.cpp)
-target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-include_directories(../server)
-
-if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
-endif()
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -1,395 +0,0 @@
-#include "common.h"
-#include "arg.h"
-#include "console.h"
-// #include "log.h"
-
-#include "server-context.h"
-#include "server-task.h"
-
-#include <atomic>
-#include <fstream>
-#include <thread>
-#include <signal.h>
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
-const char * LLAMA_ASCII_LOGO = R"(
-▄▄ ▄▄
-██ ██
-██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
-██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
-██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
-                                    ██    ██
-                                    ▀▀    ▀▀
-)";
-
-static std::atomic<bool> g_is_interrupted = false;
-static bool should_stop() {
-    return g_is_interrupted.load();
-}
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-static void signal_handler(int) {
-    if (g_is_interrupted.load()) {
-        // second Ctrl+C - exit immediately
-        // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock)
-        fprintf(stdout, "\033[0m\n");
-        fflush(stdout);
-        std::exit(130);
-    }
-    g_is_interrupted.store(true);
-}
-#endif
-
-struct cli_context {
-    server_context ctx_server;
-    json messages = json::array();
-    std::vector<raw_buffer> input_files;
-    task_params defaults;
-
-    // thread for showing "loading" animation
-    std::atomic<bool> loading_show;
-
-    cli_context(const common_params & params) {
-        defaults.sampling    = params.sampling;
-        defaults.speculative = params.speculative;
-        defaults.n_keep      = params.n_keep;
-        defaults.n_predict   = params.n_predict;
-        defaults.antiprompt  = params.antiprompt;
-
-        defaults.stream = true; // make sure we always use streaming mode
-        defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
-        // defaults.return_progress = true; // TODO: show progress
-        defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
-    }
-
-    std::string generate_completion(result_timings & out_timings) {
-        server_response_reader rd = ctx_server.get_response_reader();
-        {
-            // TODO: reduce some copies here in the future
-            server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
-            task.id        = rd.get_new_id();
-            task.index     = 0;
-            task.params    = defaults;    // copy
-            task.cli_input = messages;    // copy
-            task.cli_files = input_files; // copy
-            rd.post_task({std::move(task)});
-        }
-
-        // wait for first result
-        console::spinner::start();
-        server_task_result_ptr result = rd.next(should_stop);
-
-        console::spinner::stop();
-        std::string curr_content;
-        bool is_thinking = false;
-
-        while (result) {
-            if (should_stop()) {
-                break;
-            }
-            if (result->is_error()) {
-                json err_data = result->to_json();
-                if (err_data.contains("message")) {
-                    console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
-                } else {
-                    console::error("Error: %s\n", err_data.dump().c_str());
-                }
-                return curr_content;
-            }
-            auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
-            if (res_partial) {
-                out_timings = std::move(res_partial->timings);
-                for (const auto & diff : res_partial->oaicompat_msg_diffs) {
-                    if (!diff.content_delta.empty()) {
-                        if (is_thinking) {
-                            console::log("\n[End thinking]\n\n");
-                            console::set_display(DISPLAY_TYPE_RESET);
-                            is_thinking = false;
-                        }
-                        curr_content += diff.content_delta;
-                        console::log("%s", diff.content_delta.c_str());
-                        console::flush();
-                    }
-                    if (!diff.reasoning_content_delta.empty()) {
-                        console::set_display(DISPLAY_TYPE_REASONING);
-                        if (!is_thinking) {
-                            console::log("[Start thinking]\n");
-                        }
-                        is_thinking = true;
-                        console::log("%s", diff.reasoning_content_delta.c_str());
-                        console::flush();
-                    }
-                }
-            }
-            auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
-            if (res_final) {
-                out_timings = std::move(res_final->timings);
-                break;
-            }
-            result = rd.next(should_stop);
-        }
-        g_is_interrupted.store(false);
-        // server_response_reader automatically cancels pending tasks upon destruction
-        return curr_content;
-    }
-
-    // TODO: support remote files in the future (http, https, etc)
-    std::string load_input_file(const std::string & fname, bool is_media) {
-        std::ifstream file(fname, std::ios::binary);
-        if (!file) {
-            return "";
-        }
-        if (is_media) {
-            raw_buffer buf;
-            buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-            input_files.push_back(std::move(buf));
-            return mtmd_default_marker();
-        } else {
-            std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-            return content;
-        }
-    }
-};
-
-int main(int argc, char ** argv) {
-    common_params params;
-
-    params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) {
-        return 1;
-    }
-
-    // TODO: maybe support it later?
-    if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
-        console::error("--no-conversation is not supported by llama-cli\n");
-        console::error("please use llama-completion instead\n");
-    }
-
-    common_init();
-
-    // struct that contains llama context and inference
-    cli_context ctx_cli(params);
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    // TODO: avoid using atexit() here by making `console` a singleton
-    console::init(params.simple_io, params.use_color);
-    atexit([]() { console::cleanup(); });
-
-    console::set_display(DISPLAY_TYPE_RESET);
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-    struct sigaction sigint_action;
-    sigint_action.sa_handler = signal_handler;
-    sigemptyset (&sigint_action.sa_mask);
-    sigint_action.sa_flags = 0;
-    sigaction(SIGINT, &sigint_action, NULL);
-    sigaction(SIGTERM, &sigint_action, NULL);
-#elif defined (_WIN32)
-    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-    };
-    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-
-    console::log("\nLoading model... "); // followed by loading animation
-    console::spinner::start();
-    if (!ctx_cli.ctx_server.load_model(params)) {
-        console::spinner::stop();
-        console::error("\nFailed to load the model\n");
-        return 1;
-    }
-
-    ctx_cli.ctx_server.init();
-
-    console::spinner::stop();
-    console::log("\n");
-
-    std::thread inference_thread([&ctx_cli]() {
-        ctx_cli.ctx_server.start_loop();
-    });
-
-    auto inf = ctx_cli.ctx_server.get_info();
-    std::string modalities = "text";
-    if (inf.has_inp_image) {
-        modalities += ", vision";
-    }
-    if (inf.has_inp_audio) {
-        modalities += ", audio";
-    }
-
-    if (!params.system_prompt.empty()) {
-        ctx_cli.messages.push_back({
-            {"role",    "system"},
-            {"content", params.system_prompt}
-        });
-    }
-
-    console::log("\n");
-    console::log("%s\n", LLAMA_ASCII_LOGO);
-    console::log("build      : %s\n", inf.build_info.c_str());
-    console::log("model      : %s\n", inf.model_name.c_str());
-    console::log("modalities : %s\n", modalities.c_str());
-    if (!params.system_prompt.empty()) {
-        console::log("using custom system prompt\n");
-    }
-    console::log("\n");
-    console::log("available commands:\n");
-    console::log("  /exit or Ctrl+C     stop or exit\n");
-    console::log("  /regen              regenerate the last response\n");
-    console::log("  /clear              clear the chat history\n");
-    console::log("  /read               add a text file\n");
-    if (inf.has_inp_image) {
-        console::log("  /image <file>       add an image file\n");
-    }
-    if (inf.has_inp_audio) {
-        console::log("  /audio <file>       add an audio file\n");
-    }
-    console::log("\n");
-
-    // interactive loop
-    std::string cur_msg;
-    while (true) {
-        std::string buffer;
-        console::set_display(DISPLAY_TYPE_USER_INPUT);
-        if (params.prompt.empty()) {
-            console::log("\n> ");
-            std::string line;
-            bool another_line = true;
-            do {
-                another_line = console::readline(line, params.multiline_input);
-                buffer += line;
-            } while (another_line);
-        } else {
-            // process input prompt from args
-            for (auto & fname : params.image) {
-                std::string marker = ctx_cli.load_input_file(fname, true);
-                if (marker.empty()) {
-                    console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
-                    break;
-                }
-                console::log("Loaded media from '%s'\n", fname.c_str());
-                cur_msg += marker;
-            }
-            buffer = params.prompt;
-            if (buffer.size() > 500) {
-                console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
-            } else {
-                console::log("\n> %s\n", buffer.c_str());
-            }
-            params.prompt.clear(); // only use it once
-        }
-        console::set_display(DISPLAY_TYPE_RESET);
-        console::log("\n");
-
-        if (should_stop()) {
-            g_is_interrupted.store(false);
-            break;
-        }
-
-        // remove trailing newline
-        if (!buffer.empty() &&buffer.back() == '\n') {
-            buffer.pop_back();
-        }
-
-        // skip empty messages
-        if (buffer.empty()) {
-            continue;
-        }
-
-        bool add_user_msg = true;
-
-        // process commands
-        if (string_starts_with(buffer, "/exit")) {
-            break;
-        } else if (string_starts_with(buffer, "/regen")) {
-            if (ctx_cli.messages.size() >= 2) {
-                size_t last_idx = ctx_cli.messages.size() - 1;
-                ctx_cli.messages.erase(last_idx);
-                add_user_msg = false;
-            } else {
-                console::error("No message to regenerate.\n");
-                continue;
-            }
-        } else if (string_starts_with(buffer, "/clear")) {
-            ctx_cli.messages.clear();
-            ctx_cli.input_files.clear();
-            console::log("Chat history cleared.\n");
-            continue;
-        } else if (
-                (string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
-                (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
-            // just in case (bad copy-paste for example), we strip all trailing/leading spaces
-            std::string fname = string_strip(buffer.substr(7));
-            std::string marker = ctx_cli.load_input_file(fname, true);
-            if (marker.empty()) {
-                console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
-                continue;
-            }
-            cur_msg += marker;
-            console::log("Loaded media from '%s'\n", fname.c_str());
-            continue;
-        } else if (string_starts_with(buffer, "/read ")) {
-            std::string fname = string_strip(buffer.substr(6));
-            std::string marker = ctx_cli.load_input_file(fname, false);
-            if (marker.empty()) {
-                console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
-                continue;
-            }
-            cur_msg += marker;
-            console::log("Loaded text from '%s'\n", fname.c_str());
-            continue;
-        } else {
-            // not a command
-            cur_msg += buffer;
-        }
-
-        // generate response
-        if (add_user_msg) {
-            ctx_cli.messages.push_back({
-                {"role",    "user"},
-                {"content", cur_msg}
-            });
-            cur_msg.clear();
-        }
-        result_timings timings;
-        std::string assistant_content = ctx_cli.generate_completion(timings);
-        ctx_cli.messages.push_back({
-            {"role",    "assistant"},
-            {"content", assistant_content}
-        });
-        console::log("\n");
-
-        if (params.show_timings) {
-            console::set_display(DISPLAY_TYPE_INFO);
-            console::log("\n");
-            console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second);
-            console::set_display(DISPLAY_TYPE_RESET);
-        }
-
-        if (params.single_turn) {
-            break;
-        }
-    }
-
-    console::set_display(DISPLAY_TYPE_RESET);
-
-    console::log("\nExiting...\n");
-    ctx_cli.ctx_server.terminate();
-    inference_thread.join();
-
-    // bump the log level to display timings
-    common_log_set_verbosity_thold(LOG_LEVEL_INFO);
-    llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
-
-    return 0;
-}
--- a/tools/gguf-split/tests.sh
+++ b/tools/gguf-split/tests.sh
@@ -19,7 +19,7 @@ fi
 set -x

 SPLIT=$1/llama-gguf-split
-MAIN=$1/llama-completion
+MAIN=$1/llama-cli
 WORK_PATH=$TMP_DIR/gguf-split
 ROOT_DIR=$(realpath $(dirname $0)/../../)

--- a/tools/completion/CMakeLists.txt
+++ b/tools/completion/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET llama-completion)
-add_executable(${TARGET} completion.cpp)
+set(TARGET llama-cli)
+add_executable(${TARGET} main.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

--- a/tools/completion/README.md
+++ b/tools/completion/README.md
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@@ -86,11 +86,7 @@ static void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
    common_params params;
    g_params = &params;
-
-    // disable jinja by default
-    params.use_jinja = false;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
        return 1;
    }

@@ -525,6 +521,12 @@ int main(int argc, char ** argv) {
        is_interacting = params.interactive_first;
    }

+    LOG_WRN("*****************************\n");
+    LOG_WRN("IMPORTANT: The current llama-cli will be moved to llama-completion in the near future\n");
+    LOG_WRN("  New llama-cli will have enhanced features and improved user experience\n");
+    LOG_WRN("  More info: https://github.com/ggml-org/llama.cpp/discussions/17618\n");
+    LOG_WRN("*****************************\n");
+
    bool is_antiprompt        = false;
    bool input_echo           = true;
    bool display              = true;
@@ -541,7 +543,7 @@ int main(int argc, char ** argv) {
    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

    // the first thing we will do is to output the prompt, so set color accordingly
-    console::set_display(DISPLAY_TYPE_PROMPT);
+    console::set_display(console::prompt);
    display = params.display_prompt;

    std::vector<llama_token> embd;
@@ -586,9 +588,9 @@ int main(int argc, char ** argv) {
                const int skipped_tokens = (int) embd.size() - max_embd_size;
                embd.resize(max_embd_size);

-                console::set_display(DISPLAY_TYPE_ERROR);
+                console::set_display(console::error);
                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
-                console::set_display(DISPLAY_TYPE_RESET);
+                console::set_display(console::reset);
            }

            if (ga_n == 1) {
@@ -770,7 +772,7 @@ int main(int argc, char ** argv) {

        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
-            console::set_display(DISPLAY_TYPE_RESET);
+            console::set_display(console::reset);
            display = true;
        }

@@ -866,7 +868,7 @@ int main(int argc, char ** argv) {
                }

                // color user input only
-                console::set_display(DISPLAY_TYPE_USER_INPUT);
+                console::set_display(console::user_input);
                display = params.display_prompt;

                std::string line;
@@ -877,7 +879,7 @@ int main(int argc, char ** argv) {
                } while (another_line);

                // done taking input, reset color
-                console::set_display(DISPLAY_TYPE_RESET);
+                console::set_display(console::reset);
                display = true;

                if (buffer.empty()) { // Ctrl+D on empty line exits
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -6,31 +6,16 @@ add_library(mtmd
            mtmd.cpp
            mtmd-audio.cpp
            mtmd.h
-            mtmd-helper.cpp
-            mtmd-helper.h
            clip.cpp
            clip.h
            clip-impl.h
-            clip-model.h
-            clip-graph.h
-            models/models.h
-            models/cogvlm.cpp
-            models/internvl.cpp
-            models/kimivl.cpp
-            models/llama4.cpp
-            models/llava.cpp
-            models/minicpmv.cpp
-            models/pixtral.cpp
-            models/qwen2vl.cpp
-            models/qwen3vl.cpp
-            models/siglip.cpp
-            models/whisper-enc.cpp
+            mtmd-helper.cpp
+            mtmd-helper.h
            )

 set_target_properties(mtmd PROPERTIES
    VERSION ${LLAMA_INSTALL_VERSION}
    SOVERSION 0
-    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
 )

 target_link_libraries     (mtmd PUBLIC ggml llama)
@@ -67,15 +52,6 @@ if (TARGET BUILD_INFO)
    add_dependencies(mtmd-helper BUILD_INFO)
 endif()

-# if mtmd is linked against common, we throw an error
-if (TARGET mtmd)
-    get_target_property(libs mtmd LINK_LIBRARIES)
-    if (libs AND "common" IN_LIST libs)
-        message(FATAL_ERROR "mtmd is designed to be a public library.\n"
-                            "It must not link against common")
-    endif()
-endif()
-
 add_executable(llama-llava-cli    deprecation-warning.cpp)
 add_executable(llama-gemma3-cli   deprecation-warning.cpp)
 add_executable(llama-minicpmv-cli deprecation-warning.cpp)
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -1,115 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-cpp.h"
-#include "clip.h"
-#include "clip-impl.h"
-#include "clip-model.h"
-
-#include <vector>
-#include <functional>
-
-struct clip_graph {
-    const clip_model & model;
-    const clip_hparams & hparams;
-    projector_type proj_type;
-
-    // we only support single image per batch
-    const clip_image_f32 & img;
-
-    const int patch_size;
-    const int n_patches_x;
-    const int n_patches_y;
-    const int n_patches;
-    const int n_embd;
-    const int n_head;
-    const int d_head;
-    const int n_layer;
-    const int n_mmproj_embd;
-    const float eps;
-    const float kq_scale;
-    const clip_flash_attn_type flash_attn_type;
-
-    // for debugging
-    const bool debug_graph;
-    std::vector<ggml_tensor *> & debug_print_tensors;
-
-    ggml_context_ptr ctx0_ptr;
-    ggml_context * ctx0;
-    ggml_cgraph * gf;
-
-    clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
-
-    virtual ~clip_graph() = default;
-    virtual ggml_cgraph * build() = 0;
-
-    //
-    // utility functions
-    //
-    void cb(ggml_tensor * cur0, const char * name, int il) const;
-
-    // siglip2 naflex
-    ggml_tensor * resize_position_embeddings();
-
-    // build vision transformer (ViT) cgraph
-    // this function should cover most of the models
-    // if your model has specific features, you should probably duplicate this function
-    ggml_tensor * build_vit(
-                ggml_tensor * inp,
-                int64_t n_pos,
-                norm_type norm_t,
-                ffn_op_type ffn_t,
-                ggml_tensor * learned_pos_embd,
-                std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
-
-    // build the input after conv2d (inp_raw --> patches)
-    // returns tensor with shape [n_embd, n_patches]
-    ggml_tensor * build_inp();
-
-    ggml_tensor * build_inp_raw(int channels = 3);
-
-    ggml_tensor * build_norm(
-            ggml_tensor * cur,
-            ggml_tensor * mw,
-            ggml_tensor * mb,
-            norm_type type,
-            float norm_eps,
-            int il) const;
-
-    ggml_tensor * build_ffn(
-            ggml_tensor * cur,
-            ggml_tensor * up,
-            ggml_tensor * up_b,
-            ggml_tensor * gate,
-            ggml_tensor * gate_b,
-            ggml_tensor * down,
-            ggml_tensor * down_b,
-            ffn_op_type type_op,
-            int il) const;
-
-    ggml_tensor * build_attn(
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur,
-            ggml_tensor * k_cur,
-            ggml_tensor * v_cur,
-            ggml_tensor * kq_mask,
-            float kq_scale,
-            int il) const;
-
-    // implementation of the 2D RoPE without adding a new op in ggml
-    // this is not efficient (use double the memory), but works on all backends
-    // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
-    ggml_tensor * build_rope_2d(
-        ggml_context * ctx0,
-        ggml_tensor * cur,
-        ggml_tensor * pos_a, // first half
-        ggml_tensor * pos_b, // second half
-        const float freq_base,
-        const bool interleave_freq
-    );
-
-    // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
-    // support dynamic resolution
-    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
-};
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -1,5 +1,3 @@
-#pragma once
-
 #include "ggml.h"
 #include "gguf.h"
 #include "clip.h"
@@ -15,8 +13,6 @@

 // Internal header for clip.cpp

-#define MTMD_INTERNAL_HEADER
-
 #define KEY_FTYPE               "general.file_type"
 #define KEY_NAME                "general.name"
 #define KEY_DESCRIPTION         "general.description"
@@ -136,10 +132,6 @@
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

-// forward declaration
-// TODO: improve this later
-struct clip_ctx;
-
 enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -1,279 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "clip.h"
-#include "clip-impl.h"
-
-#include <vector>
-#include <unordered_set>
-#include <cstdint>
-#include <cmath>
-
-enum ffn_op_type {
-    FFN_GELU,
-    FFN_GELU_ERF,
-    FFN_SILU,
-    FFN_GELU_QUICK,
-};
-
-enum norm_type {
-    NORM_TYPE_NORMAL,
-    NORM_TYPE_RMS,
-};
-
-enum patch_merge_type {
-    PATCH_MERGE_FLAT,
-    PATCH_MERGE_SPATIAL_UNPAD,
-};
-
-struct clip_hparams {
-    int32_t image_size = 0;
-    int32_t patch_size = 0;
-    int32_t n_embd = 0;
-    int32_t n_ff = 0;
-    int32_t projection_dim = 0;
-    int32_t n_head = 0;
-    int32_t n_layer = 0;
-    // idefics3
-    int32_t image_longest_edge = 0;
-    int32_t image_min_pixels = -1;
-    int32_t image_max_pixels = -1;
-    int32_t n_merge = 0; // number of patch merges **per-side**
-
-    float image_mean[3];
-    float image_std[3];
-
-    // for models using dynamic image size, we need to have a smaller image size to warmup
-    // otherwise, user will get OOM everytime they load the model
-    int32_t warmup_image_size = 0;
-    int32_t warmup_audio_size = 3000;
-
-    ffn_op_type ffn_op = FFN_GELU;
-
-    patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
-
-    float eps = 1e-6;
-    float rope_theta = 0.0;
-
-    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
-    int32_t image_crop_resolution;
-    std::unordered_set<int32_t> vision_feature_layer;
-    int32_t attn_window_size = 0;
-    int32_t n_wa_pattern = 0;
-
-    // audio
-    int32_t n_mel_bins = 0; // whisper preprocessor
-    int32_t proj_stack_factor = 0; // ultravox
-
-    // legacy
-    bool has_llava_projector = false;
-    int minicpmv_version = 0;
-    int32_t minicpmv_query_num = 0;         // MiniCPM-V query number
-
-    // custom value provided by user, can be undefined if not set
-    int32_t custom_image_min_tokens = -1;
-    int32_t custom_image_max_tokens = -1;
-
-    void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
-        const int cur_merge = n_merge == 0 ? 1 : n_merge;
-        const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
-        image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
-        image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
-        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
-    }
-
-    void set_warmup_n_tokens(int n_tokens) {
-        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
-        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        const int cur_merge = n_merge == 0 ? 1 : n_merge;
-        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
-        // TODO: support warmup size for custom token numbers
-    }
-};
-
-struct clip_layer {
-    // attention
-    ggml_tensor * k_w = nullptr;
-    ggml_tensor * k_b = nullptr;
-    ggml_tensor * q_w = nullptr;
-    ggml_tensor * q_b = nullptr;
-    ggml_tensor * v_w = nullptr;
-    ggml_tensor * v_b = nullptr;
-    ggml_tensor * qkv_w = nullptr;
-    ggml_tensor * qkv_b = nullptr;
-
-    ggml_tensor * o_w = nullptr;
-    ggml_tensor * o_b = nullptr;
-
-    ggml_tensor * k_norm = nullptr;
-    ggml_tensor * q_norm = nullptr;
-
-    // layernorm 1
-    ggml_tensor * ln_1_w = nullptr;
-    ggml_tensor * ln_1_b = nullptr;
-
-    ggml_tensor * ff_up_w = nullptr;
-    ggml_tensor * ff_up_b = nullptr;
-    ggml_tensor * ff_gate_w = nullptr;
-    ggml_tensor * ff_gate_b = nullptr;
-    ggml_tensor * ff_down_w = nullptr;
-    ggml_tensor * ff_down_b = nullptr;
-
-    // layernorm 2
-    ggml_tensor * ln_2_w = nullptr;
-    ggml_tensor * ln_2_b = nullptr;
-
-    // layer scale (no bias)
-    ggml_tensor * ls_1_w = nullptr;
-    ggml_tensor * ls_2_w = nullptr;
-
-    // qwen3vl deepstack merger
-    ggml_tensor * deepstack_norm_w = nullptr;
-    ggml_tensor * deepstack_norm_b = nullptr;
-    ggml_tensor * deepstack_fc1_w = nullptr;
-    ggml_tensor * deepstack_fc1_b = nullptr;
-    ggml_tensor * deepstack_fc2_w = nullptr;
-    ggml_tensor * deepstack_fc2_b = nullptr;
-
-    bool has_deepstack() const {
-        return deepstack_fc1_w != nullptr;
-    }
-};
-
-struct clip_model {
-    clip_modality modality = CLIP_MODALITY_VISION;
-    projector_type proj_type = PROJECTOR_TYPE_MLP;
-    clip_hparams hparams;
-
-    // embeddings
-    ggml_tensor * class_embedding = nullptr;
-    ggml_tensor * patch_embeddings_0 = nullptr;
-    ggml_tensor * patch_embeddings_1 = nullptr;  // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
-    ggml_tensor * patch_bias = nullptr;
-    ggml_tensor * position_embeddings = nullptr;
-
-    ggml_tensor * pre_ln_w = nullptr;
-    ggml_tensor * pre_ln_b = nullptr;
-
-    std::vector<clip_layer> layers;
-
-    int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
-
-    ggml_tensor * post_ln_w;
-    ggml_tensor * post_ln_b;
-
-    ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
-    ggml_tensor * mm_fc_w;
-    ggml_tensor * mm_fc_b;
-
-    // LLaVA projection
-    ggml_tensor * mm_input_norm_w = nullptr;
-    ggml_tensor * mm_input_norm_b = nullptr;
-    ggml_tensor * mm_0_w = nullptr;
-    ggml_tensor * mm_0_b = nullptr;
-    ggml_tensor * mm_2_w = nullptr;
-    ggml_tensor * mm_2_b = nullptr;
-
-    ggml_tensor * image_newline = nullptr;
-
-    // Yi type models with mlp+normalization projection
-    ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
-    ggml_tensor * mm_1_b = nullptr;
-    ggml_tensor * mm_3_w = nullptr;
-    ggml_tensor * mm_3_b = nullptr;
-    ggml_tensor * mm_4_w = nullptr;
-    ggml_tensor * mm_4_b = nullptr;
-
-    // GLMV-Edge projection
-    ggml_tensor * mm_model_adapter_conv_w = nullptr;
-    ggml_tensor * mm_model_adapter_conv_b = nullptr;
-
-    // MobileVLM projection
-    ggml_tensor * mm_model_mlp_1_w = nullptr;
-    ggml_tensor * mm_model_mlp_1_b = nullptr;
-    ggml_tensor * mm_model_mlp_3_w = nullptr;
-    ggml_tensor * mm_model_mlp_3_b = nullptr;
-    ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
-    ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
-    ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
-    ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
-    ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
-    ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
-    ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
-    ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
-    ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
-    ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
-    ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
-    ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
-    ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
-    ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
-    ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
-    ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
-    ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
-    ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
-    ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
-    ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
-
-    // MobileVLM_V2 projection
-    ggml_tensor * mm_model_mlp_0_w = nullptr;
-    ggml_tensor * mm_model_mlp_0_b = nullptr;
-    ggml_tensor * mm_model_mlp_2_w = nullptr;
-    ggml_tensor * mm_model_mlp_2_b = nullptr;
-    ggml_tensor * mm_model_peg_0_w = nullptr;
-    ggml_tensor * mm_model_peg_0_b = nullptr;
-
-    // MINICPMV projection
-    ggml_tensor * mm_model_pos_embed_k = nullptr;
-    ggml_tensor * mm_model_query = nullptr;
-    ggml_tensor * mm_model_proj = nullptr;
-    ggml_tensor * mm_model_kv_proj = nullptr;
-    ggml_tensor * mm_model_attn_q_w = nullptr;
-    ggml_tensor * mm_model_attn_q_b = nullptr;
-    ggml_tensor * mm_model_attn_k_w = nullptr;
-    ggml_tensor * mm_model_attn_k_b = nullptr;
-    ggml_tensor * mm_model_attn_v_w = nullptr;
-    ggml_tensor * mm_model_attn_v_b = nullptr;
-    ggml_tensor * mm_model_attn_o_w = nullptr;
-    ggml_tensor * mm_model_attn_o_b = nullptr;
-    ggml_tensor * mm_model_ln_q_w = nullptr;
-    ggml_tensor * mm_model_ln_q_b = nullptr;
-    ggml_tensor * mm_model_ln_kv_w = nullptr;
-    ggml_tensor * mm_model_ln_kv_b = nullptr;
-    ggml_tensor * mm_model_ln_post_w = nullptr;
-    ggml_tensor * mm_model_ln_post_b = nullptr;
-
-    // gemma3
-    ggml_tensor * mm_input_proj_w = nullptr;
-    ggml_tensor * mm_soft_emb_norm_w = nullptr;
-
-    // pixtral
-    ggml_tensor * token_embd_img_break = nullptr;
-    ggml_tensor * mm_patch_merger_w = nullptr;
-
-    // ultravox / whisper encoder
-    ggml_tensor * conv1d_1_w = nullptr;
-    ggml_tensor * conv1d_1_b = nullptr;
-    ggml_tensor * conv1d_2_w = nullptr;
-    ggml_tensor * conv1d_2_b = nullptr;
-    ggml_tensor * mm_norm_pre_w = nullptr;
-    ggml_tensor * mm_norm_mid_w = nullptr;
-
-    // cogvlm
-    ggml_tensor * mm_post_fc_norm_w = nullptr;
-    ggml_tensor * mm_post_fc_norm_b = nullptr;
-    ggml_tensor * mm_h_to_4h_w = nullptr;
-    ggml_tensor * mm_gate_w = nullptr;
-    ggml_tensor * mm_4h_to_h_w = nullptr;
-    ggml_tensor * mm_boi = nullptr;
-    ggml_tensor * mm_eoi = nullptr;
-
-    bool audio_has_avgpool() const {
-        return proj_type == PROJECTOR_TYPE_QWEN2A
-            || proj_type == PROJECTOR_TYPE_VOXTRAL;
-    }
-
-    bool audio_has_stack_frames() const {
-        return proj_type == PROJECTOR_TYPE_ULTRAVOX
-            || proj_type == PROJECTOR_TYPE_VOXTRAL;
-    }
-};
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -7,8 +7,6 @@

 // !!! Internal header, to be used by mtmd only !!!

-#define MTMD_INTERNAL_HEADER
-
 struct clip_ctx;

 struct clip_image_size {
--- a/tools/mtmd/models/cogvlm.cpp
+++ b/tools/mtmd/models/cogvlm.cpp
@@ -1,98 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_cogvlm::build() {
-    GGML_ASSERT(model.class_embedding != nullptr);
-    GGML_ASSERT(model.position_embeddings != nullptr);
-
-    const int n_pos = n_patches + 1; // +1 for [CLS]
-
-    // build input and concatenate class embedding
-    ggml_tensor * inp = build_inp();
-    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
-
-    inp = ggml_add(ctx0, inp, model.position_embeddings);
-    cb(inp, "inp_pos", -1);
-
-    ggml_tensor * inpL = inp;
-
-    for (int il = 0; il < n_layer; il++) {
-        auto & layer = model.layers[il];
-        ggml_tensor * cur = inpL;
-
-        cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
-
-        cur = ggml_add(ctx0, cur, layer.qkv_b);
-
-        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
-            cur->nb[1], 0);
-        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
-            cur->nb[1], n_embd * sizeof(float));
-        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
-            cur->nb[1], 2 * n_embd * sizeof(float));
-
-        cb(Qcur, "Qcur", il);
-        cb(Kcur, "Kcur", il);
-        cb(Vcur, "Vcur", il);
-
-        cur = build_attn(layer.o_w, layer.o_b,
-            Qcur, Kcur, Vcur, nullptr, kq_scale, il);
-        cb(cur, "attn_out", il);
-
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
-        cb(cur, "attn_post_norm", il);
-
-        cur = ggml_add(ctx0, cur, inpL);
-        inpL = cur;
-
-        cur = build_ffn(cur,
-            layer.ff_up_w, layer.ff_up_b,
-            layer.ff_gate_w, layer.ff_gate_b,
-            layer.ff_down_w, layer.ff_down_b,
-            hparams.ffn_op, il);
-
-        cb(cur, "ffn_out", il);
-
-        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
-        cb(cur, "ffn_post_norm", il);
-
-        cur = ggml_add(ctx0, cur, inpL);
-        cb(cur, "layer_out", il);
-        inpL = cur;
-
-    }
-
-    // remove CLS token (like build_llama4 does)
-    ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
-        n_embd, n_patches,
-        ggml_row_size(inpL->type, n_embd), 0);
-
-    // Multiply with mm_model_proj
-    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
-
-    // Apply layernorm, weight, bias
-    cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
-
-    // Apply GELU
-    cur = ggml_gelu_inplace(ctx0, cur);
-
-    // Branch 1: multiply with mm_h_to_4h_w
-    ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
-
-    // Branch 2: multiply with mm_gate_w
-    ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
-
-    // Apply silu
-    gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
-
-    // Apply mm_4h_to_h_w
-    cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
-
-    // Concatenate with boi and eoi
-    cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
-    cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
-
-    // build the graph
-    ggml_build_forward_expand(gf, cur);
-
-    return gf;
-}
--- a/tools/mtmd/models/internvl.cpp
+++ b/tools/mtmd/models/internvl.cpp
@@ -1,69 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_internvl::build() {
-    GGML_ASSERT(model.class_embedding != nullptr);
-    GGML_ASSERT(model.position_embeddings != nullptr);
-
-    const int n_pos = n_patches + 1;
-    ggml_tensor * inp = build_inp();
-
-    // add CLS token
-    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
-
-    // The larger models use a different ViT, which uses RMS norm instead of layer norm
-    // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
-    norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
-        ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
-        : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
-
-    ggml_tensor * cur = build_vit(
-                            inp, n_pos,
-                            norm_t,
-                            hparams.ffn_op,
-                            model.position_embeddings,
-                            nullptr);
-
-    // remove CLS token
-    cur = ggml_view_2d(ctx0, cur,
-        n_embd, n_patches,
-        ggml_row_size(cur->type, n_embd), 0);
-
-    // pixel shuffle
-    {
-        const int scale_factor = model.hparams.n_merge;
-        const int bsz    = 1; // batch size, always 1 for now since we don't support batching
-        const int height = n_patches_y;
-        const int width  = n_patches_x;
-        GGML_ASSERT(scale_factor > 0);
-        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_cont_4d(ctx0, cur,
-            n_embd * scale_factor * scale_factor,
-            height / scale_factor,
-            width / scale_factor,
-            bsz);
-        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        // flatten to 2D
-        cur = ggml_cont_2d(ctx0, cur,
-            n_embd * scale_factor * scale_factor,
-            cur->ne[1] * cur->ne[2]);
-    }
-
-    // projector (always using GELU activation)
-    {
-        // projector LayerNorm uses pytorch's default eps = 1e-5
-        // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
-        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-        cur = build_ffn(cur,
-            model.mm_1_w, model.mm_1_b,
-            nullptr, nullptr,
-            model.mm_3_w, model.mm_3_b,
-            FFN_GELU,
-            -1);
-    }
-
-    // build the graph
-    ggml_build_forward_expand(gf, cur);
-
-    return gf;
-}
--- a/tools/mtmd/models/kimivl.cpp
+++ b/tools/mtmd/models/kimivl.cpp
@@ -1,63 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_kimivl::build() {
-    // 2D input positions
-    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
-    ggml_set_name(pos_h, "pos_h");
-    ggml_set_input(pos_h);
-
-    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
-    ggml_set_name(pos_w, "pos_w");
-    ggml_set_input(pos_w);
-
-    ggml_tensor * learned_pos_embd = resize_position_embeddings();
-
-    // build ViT with 2D position embeddings
-    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-        // first half is X axis and second half is Y axis
-        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
-    };
-
-    ggml_tensor * inp = build_inp();
-    ggml_tensor * cur = build_vit(
-                            inp, n_patches,
-                            NORM_TYPE_NORMAL,
-                            hparams.ffn_op,
-                            learned_pos_embd,
-                            add_pos);
-
-    cb(cur, "vit_out", -1);
-
-    {
-        // patch_merger
-        const int scale_factor = model.hparams.n_merge;
-        cur = build_patch_merge_permute(cur, scale_factor);
-
-        // projection norm
-        int proj_inp_dim = cur->ne[0];
-        cur = ggml_view_2d(ctx0, cur,
-            n_embd, cur->ne[1] * scale_factor * scale_factor,
-            ggml_row_size(cur->type, n_embd), 0);
-        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
-        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
-        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
-        cur = ggml_view_2d(ctx0, cur,
-            proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
-            ggml_row_size(cur->type, proj_inp_dim), 0);
-        cb(cur, "proj_inp_normed", -1);
-
-        // projection mlp
-        cur = build_ffn(cur,
-            model.mm_1_w, model.mm_1_b,
-            nullptr, nullptr,
-            model.mm_2_w, model.mm_2_b,
-            FFN_GELU,
-            -1);
-        cb(cur, "proj_out", -1);
-    }
-
-    // build the graph
-    ggml_build_forward_expand(gf, cur);
-
-    return gf;
-}
--- a/tools/mtmd/models/llama4.cpp
+++ b/tools/mtmd/models/llama4.cpp
@@ -1,96 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_llama4::build() {
-    GGML_ASSERT(model.class_embedding != nullptr);
-    GGML_ASSERT(model.position_embeddings != nullptr);
-
-    const int n_pos = n_patches + 1; // +1 for [CLS]
-
-    // 2D input positions
-    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-    ggml_set_name(pos_h, "pos_h");
-    ggml_set_input(pos_h);
-
-    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-    ggml_set_name(pos_w, "pos_w");
-    ggml_set_input(pos_w);
-
-    ggml_tensor * inp = build_inp_raw();
-
-    // Llama4UnfoldConvolution
-    {
-        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
-                                                patch_size, patch_size, 3, n_embd);
-        inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
-        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
-        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
-        cb(inp, "patch_conv", -1);
-    }
-
-    // add CLS token
-    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
-
-    // build ViT with 2D position embeddings
-    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-        // first half is X axis and second half is Y axis
-        // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
-        // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
-        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
-    };
-    ggml_tensor * cur = build_vit(
-                            inp, n_pos,
-                            NORM_TYPE_NORMAL,
-                            hparams.ffn_op,
-                            model.position_embeddings,
-                            add_pos);
-
-    // remove CLS token
-    cur = ggml_view_2d(ctx0, cur,
-        n_embd, n_patches,
-        ggml_row_size(cur->type, n_embd), 0);
-
-    // pixel shuffle
-    // based on Llama4VisionPixelShuffleMLP
-    // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
-    {
-        const int scale_factor = model.hparams.n_merge;
-        const int bsz = 1; // batch size, always 1 for now since we don't support batching
-        GGML_ASSERT(scale_factor > 0);
-        GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
-        cur = ggml_reshape_4d(ctx0, cur,
-            n_embd * scale_factor,
-            n_patches_x / scale_factor,
-            n_patches_y,
-            bsz);
-        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_cont_4d(ctx0, cur,
-            n_embd * scale_factor * scale_factor,
-            n_patches_x / scale_factor,
-            n_patches_y / scale_factor,
-            bsz);
-        //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        // flatten to 2D
-        cur = ggml_cont_2d(ctx0, cur,
-            n_embd * scale_factor * scale_factor,
-            n_patches / scale_factor / scale_factor);
-        cb(cur, "pixel_shuffle", -1);
-    }
-
-    // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
-    {
-        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
-        cur = ggml_gelu(ctx0, cur);
-        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
-        cur = ggml_gelu(ctx0, cur);
-        cb(cur, "adapter_mlp", -1);
-    }
-
-    // Llama4MultiModalProjector
-    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
-    cb(cur, "projected", -1);
-
-    // build the graph
-    ggml_build_forward_expand(gf, cur);
-
-    return gf;
-}
--- a/tools/mtmd/models/llava.cpp
+++ b/tools/mtmd/models/llava.cpp
@@ -1,374 +0,0 @@
-#include "models.h"
-
-// this graph is used by llava, granite and glm
-// due to having embedding_stack (used by granite), we cannot reuse build_vit
-ggml_cgraph * clip_graph_llava::build() {
-    const int batch_size = 1;
-    const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
-
-    GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
-
-    // Calculate the deepest feature layer based on hparams and projector type
-    int max_feature_layer = n_layer;
-    {
-        // Get the index of the second to last layer; this is the default for models that have a llava projector
-        int il_last = hparams.n_layer - 1;
-        int deepest_feature_layer = -1;
-
-        if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-            il_last += 1;
-        }
-
-        // If we set explicit vision feature layers, only go up to the deepest one
-        // NOTE: only used by granite-vision models for now
-        for (const auto & feature_layer : hparams.vision_feature_layer) {
-            if (feature_layer > deepest_feature_layer) {
-                deepest_feature_layer = feature_layer;
-            }
-        }
-        max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
-    }
-
-    ggml_tensor * inp = build_inp();
-
-    // concat class_embeddings and patch_embeddings
-    if (model.class_embedding) {
-        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
-    }
-
-    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-    ggml_set_name(positions, "positions");
-    ggml_set_input(positions);
-
-    inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
-
-    ggml_tensor * inpL = inp;
-
-    // pre-layernorm
-    if (model.pre_ln_w) {
-        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
-        cb(inpL, "pre_ln", -1);
-    }
-
-    std::vector<ggml_tensor *> embedding_stack;
-    const auto & vision_feature_layer = hparams.vision_feature_layer;
-
-    // loop over layers
-    for (int il = 0; il < max_feature_layer; il++) {
-        auto & layer = model.layers[il];
-        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
-
-        // If this is an embedding feature layer, save the output.
-        // NOTE: 0 index here refers to the input to the encoder.
-        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
-            embedding_stack.push_back(cur);
-        }
-
-        // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
-        cb(cur, "layer_inp_normed", il);
-
-        // self-attention
-        {
-            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-            if (layer.q_b) {
-                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-            }
-
-            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-            if (layer.k_b) {
-                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-            }
-
-            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-            if (layer.v_b) {
-                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-            }
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            cur = build_attn(layer.o_w, layer.o_b,
-                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
-            cb(cur, "attn_out", il);
-        }
-
-        // re-add the layer input, e.g., residual
-        cur = ggml_add(ctx0, cur, inpL);
-
-        inpL = cur; // inpL = residual, cur = hidden_states
-
-        cb(cur, "ffn_inp", il);
-
-        // layernorm2
-        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
-        cb(cur, "ffn_inp_normed", il);
-
-        // ffn
-        cur = build_ffn(cur,
-            layer.ff_up_w, layer.ff_up_b,
-            layer.ff_gate_w, layer.ff_gate_b,
-            layer.ff_down_w, layer.ff_down_b,
-            hparams.ffn_op, il);
-
-        cb(cur, "ffn_out", il);
-
-        // residual 2
-        cur = ggml_add(ctx0, inpL, cur);
-        cb(cur, "layer_out", il);
-
-        inpL = cur;
-    }
-
-    // post-layernorm
-    if (model.post_ln_w) {
-        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
-    }
-
-    ggml_tensor * embeddings = inpL;
-
-    // process vision feature layers (used by granite)
-    {
-        // final layer is a vision feature layer
-        if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
-            embedding_stack.push_back(inpL);
-        }
-
-        // If feature layers are explicitly set, stack them (if we have multiple)
-        if (!embedding_stack.empty()) {
-            embeddings = embedding_stack[0];
-            for (size_t i = 1; i < embedding_stack.size(); i++) {
-                embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
-            }
-        }
-    }
-
-    // llava projector (also used by granite)
-    if (hparams.has_llava_projector) {
-        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
-
-        ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
-        ggml_set_name(patches, "patches");
-        ggml_set_input(patches);
-
-        // shape [1, 576, 1024]
-        // ne is whcn, ne = [1024, 576, 1, 1]
-        embeddings = ggml_get_rows(ctx0, embeddings, patches);
-
-        // print_tensor_info(embeddings, "embeddings");
-
-        // llava projector
-        if (proj_type == PROJECTOR_TYPE_MLP) {
-            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-            embeddings = ggml_gelu(ctx0, embeddings);
-            if (model.mm_2_w) {
-                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-            }
-        }
-        else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
-            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
-            // First LayerNorm
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
-                                model.mm_1_b);
-
-            // GELU activation
-            embeddings = ggml_gelu(ctx0, embeddings);
-
-            // Second linear layer
-            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
-
-            // Second LayerNorm
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
-                                model.mm_4_b);
-        }
-        else if (proj_type == PROJECTOR_TYPE_LDP) {
-            // MobileVLM projector
-            int n_patch = 24;
-            ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
-            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
-            mlp_1 = ggml_gelu(ctx0, mlp_1);
-            ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
-            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
-            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
-
-            // block 1
-            ggml_tensor * block_1 = nullptr;
-            {
-                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
-                mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
-                mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
-                // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
-
-                // layer norm
-                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
-                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-
-                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-                // hardswish
-                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
-
-                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
-                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-                // pointwise conv
-                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
-                block_1 = ggml_relu(ctx0, block_1);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
-                block_1 = ggml_hardsigmoid(ctx0, block_1);
-                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
-                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
-                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
-                int w = block_1->ne[0], h = block_1->ne[1];
-                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-
-                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
-                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
-                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-                // residual
-                block_1 = ggml_add(ctx0, mlp_3, block_1);
-            }
-
-            // block_2
-            {
-                // stride = 2
-                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
-
-                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
-                // layer norm
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
-                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
-                // hardswish
-                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
-
-                // not sure the parameters is right for globalAvgPooling
-                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
-                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-                // pointwise conv
-                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
-                block_1 = ggml_relu(ctx0, block_1);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
-                block_1 = ggml_hardsigmoid(ctx0, block_1);
-
-                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
-                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
-                int w = block_1->ne[0], h = block_1->ne[1];
-                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
-                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
-
-                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
-                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
-                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
-            }
-            embeddings = block_1;
-        }
-        else if (proj_type == PROJECTOR_TYPE_LDPV2)
-        {
-            int n_patch = 24;
-            ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
-            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
-            mlp_0 = ggml_gelu(ctx0, mlp_0);
-            ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
-            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
-            // mlp_2 ne = [2048, 576, 1, 1]
-            // // AVG Pool Layer 2*2, strides = 2
-            mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
-            // mlp_2 ne = [576, 2048, 1, 1]
-            mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
-            // mlp_2 ne [24, 24, 2048, 1]
-            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
-            // weight ne = [3, 3, 2048, 1]
-            ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
-            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
-            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
-            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
-            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
-            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
-            embeddings = peg_0;
-        }
-        else {
-            GGML_ABORT("fatal error");
-        }
-    }
-
-    // glm projector
-    else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
-        size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
-        embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
-        embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
-        embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
-        embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
-        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
-        embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
-        // GLU
-        {
-            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
-            embeddings = ggml_gelu_inplace(ctx0, embeddings);
-            ggml_tensor * x = embeddings;
-            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
-            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
-            embeddings = ggml_swiglu_split(ctx0, embeddings, x);
-            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
-        }
-        // arrangement of BOI/EOI token embeddings
-        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
-        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
-        {
-            embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
-            embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
-        }
-    }
-
-    else {
-        GGML_ABORT("llava: unknown projector type");
-    }
-
-    // build the graph
-    ggml_build_forward_expand(gf, embeddings);
-
-    return gf;
-}
--- a/Show More
+++ b/Show More